Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
using Pkg
using LinearAlgebra
using StaticArrays
using GPUifyLoops
# Pick the GPUifyLoops backend for this script: CUDA when the CuArrays package
# can be loaded from the current environment, the CPU otherwise.
#
# `Base.find_package` returns the package's entry-file path, or `nothing` when
# the package cannot be found. It is used instead of `Pkg.installed()`, which is
# deprecated and only reports direct dependencies of the active project.
@static if Base.find_package("CuArrays") !== nothing
    using CUDAdrv
    using CUDAnative
    using CuArrays
    # Make accidental element-by-element indexing of GPU arrays an error
    # instead of a silent slow path.
    CuArrays.allowscalar(false)
    device = CUDA()
else
    device = CPU()
end
"""
    getstuff(arr, sym)

Return a fixed three-element `SVector` of `Float64` chosen by `sym`:
`:a` gives `(1, 2, 3)`, `:b` gives `(4, 5, 6)`, and any other value gives
`(7, 8, 9)`. The `arr` argument is accepted but not used.

The function is marked `@noinline`, so callers keep a real call to it.
"""
@noinline function getstuff(arr, sym)
    sym == :a && return SVector(1.0, 2.0, 3.0)
    sym == :b && return SVector(4.0, 5.0, 6.0)
    return SVector(7.0, 8.0, 9.0)
end
"""
    kernel!()

Kernel body launched through GPUifyLoops. It creates an uninitialized 3×3
`Float64` `MArray`, overwrites its third column with `getstuff(scratch, :a)`
inside an `@loop`, synchronizes, and returns `nothing`.
"""
function kernel!()
    scratch = MArray{Tuple{3, 3}, Float64}(undef)
    @inbounds begin
        # GPUifyLoops `@loop` form `(cpu_range; gpu_index)`: the range 1:5 is
        # the CPU iteration space and `threadIdx().x` is the GPU-side index.
        # The loop variable itself is not read in the body.
        @loop for lane in (1:5; threadIdx().x)
            scratch[:, 3] = getstuff(scratch, :a)
        end
        @synchronize
    end
    return nothing
end
"""
    foo(device, n)

Launch `kernel!` on `device` `n` times in a row, each launch using
`threads=(5,5,1)` and `blocks=1000`. Returns `nothing`.
"""
function foo(device, n)
    for _ in 1:n
        @launch(device, threads=(5,5,1), blocks=1000, kernel!())
    end
    return nothing
end
# Script entry point: run 1000 kernel launches on the device selected above.
foo(device, 1000)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment