# asinghvi17/metal_try.jl

## metal_try.jl
using BenchmarkTools, Metal
using Makie, CairoMakie
using RecurrenceAnalysis

# GPU kernel skeleton that is meant to fill `output_matrix` from the signals `x1` and `x2`.
# Each thread looks up its own 2-D position in the launch grid; no entry is written yet.
function recurrence_kernel!(output_matrix, x1, x2)
    # Metal.jl exposes the per-thread grid position through `thread_position_in_grid_2d()`,
    # whose result destructures into the (x, y) indices of this thread.
    x, y = Metal.thread_position_in_grid_2d()

    # TODO: compute the entry for this thread's (x, y) pair and store it in `output_matrix`,
    # skipping threads whose position falls outside the matrix.

    # GPU kernels must not return a value.
    return nothing
end


# Two identical Float32 sine signals, sampled at 10^4 evenly spaced points on [0, 10π]
# and wrapped in Metal arrays.
# NOTE(review): this script's original note said the length should be 10^5, while the
# code uses 10^4 — confirm which size is intended.
f1 = Metal.MtlArray(sin.(LinRange{Float32}(0, 10π, 10^4)))
f2 = Metal.MtlArray(sin.(LinRange{Float32}(0, 10π, 10^4)))

# Euclidean distance between two real scalars, i.e. |x - y|.
# The previous form, sqrt(x^2 + y^2), is the norm of the point (x, y) and is non-zero
# even for identical inputs. Converting with `float` first keeps the floating-point
# return type that `sqrt` used to give and avoids wrap-around for unsigned integers.
euclidean_distance(x::Real, y::Real) = abs(float(x) - float(y))

# Pairwise matrix: `f1` (column) broadcast against `f2'` (row) through `euclidean_distance`.
distmat_gpu = euclidean_distance.(f1, f2')
# Host copy for plotting. This converts the matrix computed above instead of running the
# same broadcast a second time, which would allocate another full-size result.
distmat_gpu_to_cpu = Array(distmat_gpu)
# The generic abstractarray recipe applied to GPUArrays in Makie is extremely slow,
# so we have to actually define a specific recipe for them here:

# Single-argument conversion: materialize a GPU array as an `Array` with the same
# element type and number of dimensions.
function Makie.convert_single_argument(
    arr::Metal.GPUArraysCore.AbstractGPUArray{T, N}
) where {T, N}
    return Array{T, N}(arr)
end
# Plot-type conversion: copy the GPU array into an `Array{T, N}` and hand that copy to
# the `convert_arguments` method for the same plot type.
function Makie.convert_arguments(
    PT::Type{<:Makie.AbstractPlot}, arr::Metal.GPUArraysCore.AbstractGPUArray{T, N}
) where {T, N}
    host_copy = Array{T, N}(arr)
    return Makie.convert_arguments(PT, host_copy)
end

# Plot the host-side copy of the matrix as a heatmap.
heatmap(distmat_gpu_to_cpu)

# CPU reference: RecurrenceAnalysis' `RecurrenceMatrix` on a `Vector` copy of `f1`.
# NOTE(review): 0.1 is presumably the recurrence threshold — confirm against the package docs.
@benchmark RecurrenceAnalysis.RecurrenceMatrix($(Vector(f1)), 0.1)

# CPU baseline: the `euclidean_distance` broadcast on `Vector` copies of `f1` and `f2`.
# The `$(...)` interpolation means the copies are made when the benchmark is set up,
# not inside the timed expression.
@benchmark euclidean_distance.($(Vector(f1)), $(Vector(f2))')
# BenchmarkTools.Trial: 126 samples with 1 evaluation.
#  Range (min … max):  33.505 ms … 45.455 ms  ┊ GC (min … max):  1.91% … 15.44%
#  Time  (median):     39.998 ms              ┊ GC (median):    14.65%
#  Time  (mean ± σ):   39.956 ms ±  1.905 ms  ┊ GC (mean ± σ):  14.41% ±  1.77%
#
#                                  ▃ ▄█▄ ▃▂
#   ▃▁▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█▇▃▇▃▃▄▃▇▆▇▄▄█▆███▆███▇▆▆▁▁▆▃▃▁▄▁▆▁▁▁▃▁▁▇ ▃
#   33.5 ms         Histogram: frequency by time        44.6 ms <
#
#  Memory estimate: 381.47 MiB, allocs estimate: 2.

# GPU run of the same broadcast on the Metal arrays. The call is wrapped in `Metal.@sync`,
# presumably so the timing waits for the GPU work to finish — confirm against Metal.jl docs.
@benchmark Metal.@sync(euclidean_distance.($(f1), $(f2')))
# BenchmarkTools.Trial: 124 samples with 1 evaluation.
#  Range (min … max):   88.542 μs …    4.701 s  ┊ GC (min … max): 0.00% … 0.00%
#  Time  (median):     138.270 μs               ┊ GC (median):    0.00%
#  Time  (mean ± σ):    49.261 ms ± 422.128 ms  ┊ GC (mean ± σ):  0.00% ± 0.00%
#
#   █  █
#   █▁▁█▁▁▄▁▁▁▁▁▁▁▄▁▁▁▄▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▄ ▄
#   88.5 μs       Histogram: log(frequency) by time        302 ms <
#
#  Memory estimate: 6.73 KiB, allocs estimate: 272.

# This shows the superiority of GPU processing pretty easily - a median of ~0.14 ms vs ~40 ms is fast indeed.

# Here's the problem: the GPU only has so much memory, and quite a bit of it is already in use
# for your display.  In addition, special matrix types like `Symmetric`, which halve the computational
# and memory burden, are not actually available on the GPU.

# How do we fix this?

# One possibility is to provide a specific tiling format to the GPU, specifying that
# only certain blocks within the matrix should actually be computed.  This should provide
# most of the benefit of the tiling with only a bit of waste above the diagonal,
# if most computation is below it.

# However, this needs a good knowledge of how many threads are being used and how the architecture of the GPU works.
	using BenchmarkTools, Metal
	using Makie, CairoMakie
	using RecurrenceAnalysis

	# GPU kernel skeleton that is meant to fill `output_matrix` from the signals `x1` and `x2`.
	# Each thread looks up its own 2-D position in the launch grid; no entry is written yet.
	function recurrence_kernel!(output_matrix, x1, x2)
		# Metal.jl exposes the per-thread grid position through `thread_position_in_grid_2d()`,
		# whose result destructures into the (x, y) indices of this thread.
		x, y = Metal.thread_position_in_grid_2d()

		# TODO: compute the entry for this thread's (x, y) pair and store it in `output_matrix`,
		# skipping threads whose position falls outside the matrix.

		# GPU kernels must not return a value.
		return nothing
	end


	# Two identical Float32 sine signals, sampled at 10^4 evenly spaced points on [0, 10π]
	# and piped into `Metal.MtlArray`. The pipe operator is written as `|>`; the escaped
	# form `\|>` does not parse.
	# NOTE(review): this script's original note said the length should be 10^5, while the
	# code uses 10^4 — confirm which size is intended.
	f1 = sin.(LinRange{Float32}(0, 10π, 10^4)) |> Metal.MtlArray
	f2 = sin.(LinRange{Float32}(0, 10π, 10^4)) |> Metal.MtlArray

	# Euclidean distance between two real scalars, i.e. |x - y|.
	# The previous form, sqrt(x^2 + y^2), is the norm of the point (x, y) and is non-zero
	# even for identical inputs. Converting with `float` first keeps the floating-point
	# return type that `sqrt` used to give and avoids wrap-around for unsigned integers.
	euclidean_distance(x::Real, y::Real) = abs(float(x) - float(y))

	# Pairwise matrix: `f1` (column) broadcast against `f2'` (row) through `euclidean_distance`.
	distmat_gpu = euclidean_distance.(f1, f2')
	# Host copy for plotting. This converts the matrix computed above instead of running the
	# same broadcast a second time, which would allocate another full-size result.
	distmat_gpu_to_cpu = Array(distmat_gpu)
	# The generic abstractarray recipe applied to GPUArrays in Makie is extremely slow,
	# so we have to actually define a specific recipe for them here:

	# Single-argument conversion: materialize a GPU array as an `Array` with the same
	# element type and number of dimensions.
	function Makie.convert_single_argument(
		arr::Metal.GPUArraysCore.AbstractGPUArray{T, N}
	) where {T, N}
		return Array{T, N}(arr)
	end
	# Plot-type conversion: copy the GPU array into an `Array{T, N}` and hand that copy to
	# the `convert_arguments` method for the same plot type.
	function Makie.convert_arguments(
		PT::Type{<:Makie.AbstractPlot}, arr::Metal.GPUArraysCore.AbstractGPUArray{T, N}
	) where {T, N}
		host_copy = Array{T, N}(arr)
		return Makie.convert_arguments(PT, host_copy)
	end

	# Plot the host-side copy of the matrix as a heatmap.
	heatmap(distmat_gpu_to_cpu)

	# CPU reference: RecurrenceAnalysis' `RecurrenceMatrix` on a `Vector` copy of `f1`.
	# NOTE(review): 0.1 is presumably the recurrence threshold — confirm against the package docs.
	@benchmark RecurrenceAnalysis.RecurrenceMatrix($(Vector(f1)), 0.1)

	# CPU baseline: the `euclidean_distance` broadcast on `Vector` copies of `f1` and `f2`.
	# The `$(...)` interpolation means the copies are made when the benchmark is set up,
	# not inside the timed expression.
	@benchmark euclidean_distance.($(Vector(f1)), $(Vector(f2))')
	# BenchmarkTools.Trial: 126 samples with 1 evaluation.
	# Range (min … max): 33.505 ms … 45.455 ms ┊ GC (min … max): 1.91% … 15.44%
	# Time (median): 39.998 ms ┊ GC (median): 14.65%
	# Time (mean ± σ): 39.956 ms ± 1.905 ms ┊ GC (mean ± σ): 14.41% ± 1.77%
	#
	# ▃ ▄█▄ ▃▂
	# ▃▁▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█▇▃▇▃▃▄▃▇▆▇▄▄█▆███▆███▇▆▆▁▁▆▃▃▁▄▁▆▁▁▁▃▁▁▇ ▃
	# 33.5 ms Histogram: frequency by time 44.6 ms <
	#
	# Memory estimate: 381.47 MiB, allocs estimate: 2.

	# GPU run of the same broadcast on the Metal arrays. The call is wrapped in `Metal.@sync`,
	# presumably so the timing waits for the GPU work to finish — confirm against Metal.jl docs.
	@benchmark Metal.@sync(euclidean_distance.($(f1), $(f2')))
	# BenchmarkTools.Trial: 124 samples with 1 evaluation.
	# Range (min … max): 88.542 μs … 4.701 s ┊ GC (min … max): 0.00% … 0.00%
	# Time (median): 138.270 μs ┊ GC (median): 0.00%
	# Time (mean ± σ): 49.261 ms ± 422.128 ms ┊ GC (mean ± σ): 0.00% ± 0.00%
	#
	# █ █
	# █▁▁█▁▁▄▁▁▁▁▁▁▁▄▁▁▁▄▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▄ ▄
	# 88.5 μs Histogram: log(frequency) by time 302 ms <
	#
	# Memory estimate: 6.73 KiB, allocs estimate: 272.

	# This shows the superiority of GPU processing pretty easily - a median of ~0.14 ms vs ~40 ms is fast indeed.

	# Here's the problem: the GPU only has so much memory, and quite a bit of it is already in use
	# for your display. In addition, special matrix types like `Symmetric`, which halve the computational
	# and memory burden, are not actually available on the GPU.

	# How do we fix this?

	# One possibility is to provide a specific tiling format to the GPU, specifying that
	# only certain blocks within the matrix should actually be computed. This should provide
	# most of the benefit of the tiling with only a bit of waste above the diagonal,
	# if most computation is below it.

	# However, this needs a good knowledge of how many threads are being used and how the architecture of the GPU works.