AlfTetzlaff/nbodies.jl

## nbodies.jl
using CUDAnative
using CuArrays
using StaticArrays
using BenchmarkTools
using Makie
using Test

@inline function lennard_jones(dr, ϵ::T, σ::T) where T
    invr2 = inv(sum(abs2, dr) + T(1E-7))
    six_term = (σ ^ 2 * invr2) ^ 3
    f = min(((24 * ϵ) * invr2) * (2 * six_term ^ 2 - six_term), T(100))
    return f
end

@inline function pairwise_acceleration!(a, coords, i, j, f::Function)
    dr = coords[j] - coords[i]
    acc = f(dr) * dr
    a[i] -= acc
    a[j] += acc
    return nothing
end

function accelerations(coords; ϵ=0.2, σ=1.)
    N = length(coords)
    a = zero(coords)
    lj(dr) = lennard_jones(dr, ϵ, σ)
    @inbounds for i in 1:N-1
        for j in i+1:N
            pairwise_acceleration!(a, coords, i, j, lj)
        end
    end
    return a
end

function velocity_verlet(du0, u0, steps, dt, f::Function)
    u = copy(u0)
    du = copy(du0)
    u_tmp = similar(u)
    du_tmp = similar(du)
    for _ in 1:steps
        a = f(u)
        u_tmp .= u + du*dt + 0.5a*dt^2
        du .= du + 0.5(a + f(u_tmp))*dt
        u .= u_tmp
    end
    return du, u
end

function calculate_forces!(r::CuDeviceVector{T}, a::CuDeviceVector{T}) where {T}
    N = length(r)
    gtid = (blockIdx().x - 1) * blockDim().x + threadIdx().x  # global thread id
    shared = @cuStaticSharedMem(T, 128)
    tile = 0
    pos = r[gtid]
    acc = zero(T)
    for i in 1:blockDim().x:N
        idx = tile * blockDim().x + threadIdx().x
        shared[threadIdx().x] = r[idx]
        sync_threads()

        @inbounds for j in 1:blockDim().x
            dr = shared[j]-pos
            acc -= lennard_jones(dr, 2f0, 1f0)*dr
        end

        sync_threads()
        tile += 1
    end
    a[gtid] = acc
    return nothing
end

function accelerations(coords::CuVector; ϵ=0.2f0, σ=1f0, nthreads=128)
    N = length(coords)
    if mod(N, nthreads) != 0
        raise ArgumentError("Number of particles has to be a multiple of the number of threads.")
    end
    a_d = CuArray(zeros(SVector{3,Float32}, N))  # not optimal, since copying!
    nblocks = N ÷ nthreads
    CuArrays.@sync @cuda blocks=nblocks threads=nthreads calculate_forces!(coords, a_d)
    return a_d
end

function main(scene, steps)

    N = 4096*2*2
    r0 = [SVector(50*rand(Float32, 3)...) for _ in 1:N]
    r0_d = CuVector(r0)
    v0 = zeros(SVector{3,Float32}, N)
    v0_d = CuVector(v0)

    res = accelerations(r0; σ=1f0, ϵ=2f0)
    res_d = Array(accelerations(r0_d))
    @test isapprox(res, res_d, rtol=1E-6)

    du, u = v0_d, r0_d  # Toggle between GPU and CPU
    # du, u = v0, r0

    img = meshscatter!(scene, Array(u), markersize=0.2)[end]
    cameracontrols(scene).rotationspeed[] = 0.01

    for t in 1:steps
        if !isopen(scene)
            break
        end
        # t0 = time_ns()
        du, u = velocity_verlet(du, u, 1, 0.001f0, accelerations)
        # r0, v0 = velocity_verlet(v0, r0, 1, 0.001f0, accelerations)
        img[1] = Array(u)
        # t1 = time_ns()
        # println(string(round(1. /(Float64(t1-t0)/1E9), digits=1), " FPS"))
        yield()
    end
end

scene = Scene()
main(scene, 1E4)


###################################
# A little more sophisticated kernel which is applicable to an arbitrary number of atoms, accepts any force function
# and has a variable shared memory size / number of threads. To be used with the acceleration function at the bottom.
function nbodykernel!(r::CuDeviceVector{T}, a::CuDeviceVector{T}, f::Function, ::Val{TH}) where {T,TH}
    N = length(r)
    tid = threadIdx().x
    gtid = (blockIdx().x - 1) * blockDim().x + tid  # global thread id
    shared = @cuStaticSharedMem(T, TH)

    full_blocks = N ÷ blockDim().x
    rest = N % blockDim().x

    @inbounds begin
        if gtid <= N
            pos = r[gtid]
        else
            pos = zero(T)
        end
        acc = zero(T)

        tile = 0
        for i in 1:full_blocks
            idx = tile * blockDim().x + tid
            shared[tid] = r[idx]
            sync_threads()
            for j in 1:blockDim().x
                dr = shared[j]-pos
                acc -= f(dr)*dr
            end
            sync_threads()
            tile += 1
        end

        if tid <= rest
            idx = tile * blockDim().x + tid
            shared[tid] = r[idx]
        end
        sync_threads()

        for j in 1:rest
            dr = shared[j]-pos
            acc -= f(dr)*dr
        end
        sync_threads()

        if gtid <= N
            a[gtid] = acc
        end
    end #inbounds
    return nothing
end

function accelerations(coords::CuVector; ϵ=2f0, σ=1f0, nthreads=128)
    N = length(coords)
    a_d = similar(coords)
    @inline g(x) = lennard_jones(x, ϵ, σ)
    CuArrays.@sync @cuda blocks=ceil(Int, N/nthreads) threads=nthreads nbodykernel!(coords, a_d, g, Val(nthreads))
    return a_d
end
	using CUDAnative
	using CuArrays
	using StaticArrays
	using BenchmarkTools
	using Makie
	using Test

	@inline function lennard_jones(dr, ϵ::T, σ::T) where T
	invr2 = inv(sum(abs2, dr) + T(1E-7))
	six_term = (σ ^ 2 * invr2) ^ 3
	f = min(((24 * ϵ) * invr2) * (2 * six_term ^ 2 - six_term), T(100))
	return f
	end

	@inline function pairwise_acceleration!(a, coords, i, j, f::Function)
	dr = coords[j] - coords[i]
	acc = f(dr) * dr
	a[i] -= acc
	a[j] += acc
	return nothing
	end

	function accelerations(coords; ϵ=0.2, σ=1.)
	N = length(coords)
	a = zero(coords)
	lj(dr) = lennard_jones(dr, ϵ, σ)
	@inbounds for i in 1:N-1
	for j in i+1:N
	pairwise_acceleration!(a, coords, i, j, lj)
	end
	end
	return a
	end

	function velocity_verlet(du0, u0, steps, dt, f::Function)
	u = copy(u0)
	du = copy(du0)
	u_tmp = similar(u)
	du_tmp = similar(du)
	for _ in 1:steps
	a = f(u)
	u_tmp .= u + dudt + 0.5adt^2
	du .= du + 0.5(a + f(u_tmp))*dt
	u .= u_tmp
	end
	return du, u
	end

	function calculate_forces!(r::CuDeviceVector{T}, a::CuDeviceVector{T}) where {T}
	N = length(r)
	gtid = (blockIdx().x - 1) * blockDim().x + threadIdx().x # global thread id
	shared = @cuStaticSharedMem(T, 128)
	tile = 0
	pos = r[gtid]
	acc = zero(T)
	for i in 1:blockDim().x:N
	idx = tile * blockDim().x + threadIdx().x
	shared[threadIdx().x] = r[idx]
	sync_threads()

	@inbounds for j in 1:blockDim().x
	dr = shared[j]-pos
	acc -= lennard_jones(dr, 2f0, 1f0)*dr
	end

	sync_threads()
	tile += 1
	end
	a[gtid] = acc
	return nothing
	end

	function accelerations(coords::CuVector; ϵ=0.2f0, σ=1f0, nthreads=128)
	N = length(coords)
	if mod(N, nthreads) != 0
	raise ArgumentError("Number of particles has to be a multiple of the number of threads.")
	end
	a_d = CuArray(zeros(SVector{3,Float32}, N)) # not optimal, since copying!
	nblocks = N ÷ nthreads
	CuArrays.@sync @cuda blocks=nblocks threads=nthreads calculate_forces!(coords, a_d)
	return a_d
	end

	function main(scene, steps)

	N = 409622
	r0 = [SVector(50*rand(Float32, 3)...) for _ in 1:N]
	r0_d = CuVector(r0)
	v0 = zeros(SVector{3,Float32}, N)
	v0_d = CuVector(v0)

	res = accelerations(r0; σ=1f0, ϵ=2f0)
	res_d = Array(accelerations(r0_d))
	@test isapprox(res, res_d, rtol=1E-6)

	du, u = v0_d, r0_d # Toggle between GPU and CPU
	# du, u = v0, r0

	img = meshscatter!(scene, Array(u), markersize=0.2)[end]
	cameracontrols(scene).rotationspeed[] = 0.01

	for t in 1:steps
	if !isopen(scene)
	break
	end
	# t0 = time_ns()
	du, u = velocity_verlet(du, u, 1, 0.001f0, accelerations)
	# r0, v0 = velocity_verlet(v0, r0, 1, 0.001f0, accelerations)
	img[1] = Array(u)
	# t1 = time_ns()
	# println(string(round(1. /(Float64(t1-t0)/1E9), digits=1), " FPS"))
	yield()
	end
	end

	scene = Scene()
	main(scene, 1E4)


	###################################
	# A little more sophisticated kernel which is applicable to an arbitrary number of atoms, accepts any force function
	# and has a variable shared memory size / number of threads. To be used with the acceleration function at the bottom.
	function nbodykernel!(r::CuDeviceVector{T}, a::CuDeviceVector{T}, f::Function, ::Val{TH}) where {T,TH}
	N = length(r)
	tid = threadIdx().x
	gtid = (blockIdx().x - 1) * blockDim().x + tid # global thread id
	shared = @cuStaticSharedMem(T, TH)

	full_blocks = N ÷ blockDim().x
	rest = N % blockDim().x

	@inbounds begin
	if gtid <= N
	pos = r[gtid]
	else
	pos = zero(T)
	end
	acc = zero(T)

	tile = 0
	for i in 1:full_blocks
	idx = tile * blockDim().x + tid
	shared[tid] = r[idx]
	sync_threads()
	for j in 1:blockDim().x
	dr = shared[j]-pos
	acc -= f(dr)*dr
	end
	sync_threads()
	tile += 1
	end

	if tid <= rest
	idx = tile * blockDim().x + tid
	shared[tid] = r[idx]
	end
	sync_threads()

	for j in 1:rest
	dr = shared[j]-pos
	acc -= f(dr)*dr
	end
	sync_threads()

	if gtid <= N
	a[gtid] = acc
	end
	end #inbounds
	return nothing
	end

	function accelerations(coords::CuVector; ϵ=2f0, σ=1f0, nthreads=128)
	N = length(coords)
	a_d = similar(coords)
	@inline g(x) = lennard_jones(x, ϵ, σ)
	CuArrays.@sync @cuda blocks=ceil(Int, N/nthreads) threads=nthreads nbodykernel!(coords, a_d, g, Val(nthreads))
	return a_d
	end