SimonDanisch/mandelbrot.jl

## mandelbrot.jl
# Super simple SIMD implementation for Complex Numbers
struct ComplexSIMD{N,T}
    re::NTuple{N,T}
    im::NTuple{N,T}
end
Base.abs2(z::ComplexSIMD) = z.re .* z.re .+ z.im .* z.im
Base.:(*)(a::ComplexSIMD, b::ComplexSIMD) = ComplexSIMD(a.re .* b.re .- a.im .* b.im, a.re .* b.im .+ a.im .* b.re)
Base.:(+)(a::ComplexSIMD, b::ComplexSIMD) = ComplexSIMD(a.re .+ b.re, a.im .+ b.im)

function mandelbrot_kernel(c::ComplexSIMD{N,T}) where {N,T}
    z = c
    mask = ntuple(k -> true, N)
    iters = ntuple(k -> T(0), N)
    for i in 1:200
        !any(mask) && return iters
        mask = abs2(z) .<= 4.0
        iters = iters .+ (mask .* 1)
        z = z * z + c
    end
    return iters
end

function inner_loop(result, x_range, y_range, y, ::Val{N}) where N
    ci = y_range[y]
    for x in 1:N:length(x_range)
        c = ComplexSIMD(ntuple(k -> x_range[x+k-1], N), ntuple(k -> ci, N))
        iters = mandelbrot_kernel(c)
        foreach(enumerate(iters)) do (k, v)
            # Write the result to the output array
            result[y, x+k-1] = v
        end
    end
end

function mandelbrot(result, x_range, y_range, N)
    @sync for y in eachindex(y_range)
        Threads.@spawn @inbounds begin
            inner_loop(result, x_range, y_range, y, N)
        end
    end
    return result
end

using BenchmarkTools

N = 960
x_range = range(-2.0, 0.6, N)
y_range = range(-1.5, 1.5, N)
result = zeros(N, N)
@btime mandelbrot(result, x_range, y_range, Val(4));
using GLMakie
heatmap(result, colormap=:viridis, colorscale=sqrt)

## mandelbrot.mojo
from benchmark import Benchmark
from complex import ComplexSIMD, ComplexFloat64
from math import iota
from python import Python
from runtime.llcl import num_cores, Runtime
from algorithm import parallelize, vectorize
from tensor import Tensor
from utils.index import Index

alias width = 960
alias height = 960
alias MAX_ITERS = 200

alias min_x = -2.0
alias max_x = 0.6
alias min_y = -1.5
alias max_y = 1.5

alias float_type = DType.float64
alias simd_width = simdwidthof[float_type]()


def show_plot(tensor: Tensor[float_type]):
    alias scale = 10
    alias dpi = 64

    np = Python.import_module("numpy")
    plt = Python.import_module("matplotlib.pyplot")
    colors = Python.import_module("matplotlib.colors")

    numpy_array = np.zeros((height, width), np.float64)

    for row in range(height):
        for col in range(width):
            numpy_array.itemset((col, row), tensor[col, row])

    fig = plt.figure(1, [scale, scale * height // width], dpi)
    ax = fig.add_axes([0.0, 0.0, 1.0, 1.0], False, 1)
    light = colors.LightSource(315, 10, 0, 1, 1, 0)

    image = light.shade(
        numpy_array, plt.cm.hot, colors.PowerNorm(0.3), "hsv", 0, 0, 1.5
    )
    plt.imshow(image)
    plt.axis("off")
    plt.show()


fn mandelbrot_kernel_SIMD[
    simd_width: Int
](c: ComplexSIMD[float_type, simd_width]) -> SIMD[float_type, simd_width]:
    """A vectorized implementation of the inner mandelbrot computation."""
    var z = ComplexSIMD[float_type, simd_width](0, 0)
    var iters = SIMD[float_type, simd_width](0)

    var in_set_mask: SIMD[DType.bool, simd_width] = True
    for i in range(MAX_ITERS):
        if not in_set_mask.reduce_or():
            break
        in_set_mask = z.squared_norm() <= 4
        iters = in_set_mask.select(iters + 1, iters)
        z = z.squared_add(c)

    return iters


fn parallelized():
    let t = Tensor[float_type](height, width)

    @parameter
    fn worker(row: Int):
        let scale_x = (max_x - min_x) / width
        let scale_y = (max_y - min_y) / height

        @parameter
        fn compute_vector[simd_width: Int](col: Int):
            """Each time we oeprate on a `simd_width` vector of pixels."""
            let cx = min_x + (col + iota[float_type, simd_width]()) * scale_x
            let cy = min_y + row * scale_y
            let c = ComplexSIMD[float_type, simd_width](cx, cy)
            t.data().simd_store[simd_width](
                row * width + col, mandelbrot_kernel_SIMD[simd_width](c)
            )

        # Vectorize the call to compute_vector where call gets a chunk of pixels.
        vectorize[simd_width, compute_vector](width)

    with Runtime() as rt:

        @parameter
        fn bench_parallel[simd_width: Int]():
            parallelize[worker](rt, height, 5 * num_cores())

        alias simd_width = simdwidthof[DType.float64]()
        let parallelized = Benchmark().run[bench_parallel[simd_width]]() / 1e6
        print("Parallelized:", parallelized, "ms")

    try:
        _ = show_plot(t)
    except e:
        print("failed to show plot:", e.value)


def main():
    parallelized()
	# Super simple SIMD implementation for Complex Numbers
	struct ComplexSIMD{N,T}
	re::NTuple{N,T}
	im::NTuple{N,T}
	end
	Base.abs2(z::ComplexSIMD) = z.re .* z.re .+ z.im .* z.im
	Base.:()(a::ComplexSIMD, b::ComplexSIMD) = ComplexSIMD(a.re . b.re .- a.im .* b.im, a.re .* b.im .+ a.im .* b.re)
	Base.:(+)(a::ComplexSIMD, b::ComplexSIMD) = ComplexSIMD(a.re .+ b.re, a.im .+ b.im)

	function mandelbrot_kernel(c::ComplexSIMD{N,T}) where {N,T}
	z = c
	mask = ntuple(k -> true, N)
	iters = ntuple(k -> T(0), N)
	for i in 1:200
	!any(mask) && return iters
	mask = abs2(z) .<= 4.0
	iters = iters .+ (mask .* 1)
	z = z * z + c
	end
	return iters
	end

	function inner_loop(result, x_range, y_range, y, ::Val{N}) where N
	ci = y_range[y]
	for x in 1:N:length(x_range)
	c = ComplexSIMD(ntuple(k -> x_range[x+k-1], N), ntuple(k -> ci, N))
	iters = mandelbrot_kernel(c)
	foreach(enumerate(iters)) do (k, v)
	# Write the result to the output array
	result[y, x+k-1] = v
	end
	end
	end

	function mandelbrot(result, x_range, y_range, N)
	@sync for y in eachindex(y_range)
	Threads.@spawn @inbounds begin
	inner_loop(result, x_range, y_range, y, N)
	end
	end
	return result
	end

	using BenchmarkTools

	N = 960
	x_range = range(-2.0, 0.6, N)
	y_range = range(-1.5, 1.5, N)
	result = zeros(N, N)
	@btime mandelbrot(result, x_range, y_range, Val(4));
	using GLMakie
	heatmap(result, colormap=:viridis, colorscale=sqrt)
	from benchmark import Benchmark
	from complex import ComplexSIMD, ComplexFloat64
	from math import iota
	from python import Python
	from runtime.llcl import num_cores, Runtime
	from algorithm import parallelize, vectorize
	from tensor import Tensor
	from utils.index import Index

	alias width = 960
	alias height = 960
	alias MAX_ITERS = 200

	alias min_x = -2.0
	alias max_x = 0.6
	alias min_y = -1.5
	alias max_y = 1.5

	alias float_type = DType.float64
	alias simd_width = simdwidthof[float_type]()


	def show_plot(tensor: Tensor[float_type]):
	alias scale = 10
	alias dpi = 64

	np = Python.import_module("numpy")
	plt = Python.import_module("matplotlib.pyplot")
	colors = Python.import_module("matplotlib.colors")

	numpy_array = np.zeros((height, width), np.float64)

	for row in range(height):
	for col in range(width):
	numpy_array.itemset((col, row), tensor[col, row])

	fig = plt.figure(1, [scale, scale * height // width], dpi)
	ax = fig.add_axes([0.0, 0.0, 1.0, 1.0], False, 1)
	light = colors.LightSource(315, 10, 0, 1, 1, 0)

	image = light.shade(
	numpy_array, plt.cm.hot, colors.PowerNorm(0.3), "hsv", 0, 0, 1.5
	)
	plt.imshow(image)
	plt.axis("off")
	plt.show()


	fn mandelbrot_kernel_SIMD[
	simd_width: Int
	](c: ComplexSIMD[float_type, simd_width]) -> SIMD[float_type, simd_width]:
	"""A vectorized implementation of the inner mandelbrot computation."""
	var z = ComplexSIMD[float_type, simd_width](0, 0)
	var iters = SIMD[float_type, simd_width](0)

	var in_set_mask: SIMD[DType.bool, simd_width] = True
	for i in range(MAX_ITERS):
	if not in_set_mask.reduce_or():
	break
	in_set_mask = z.squared_norm() <= 4
	iters = in_set_mask.select(iters + 1, iters)
	z = z.squared_add(c)

	return iters


	fn parallelized():
	let t = Tensor[float_type](height, width)

	@parameter
	fn worker(row: Int):
	let scale_x = (max_x - min_x) / width
	let scale_y = (max_y - min_y) / height

	@parameter
	fn compute_vector[simd_width: Int](col: Int):
	"""Each time we oeprate on a `simd_width` vector of pixels."""
	let cx = min_x + (col + iota[float_type, simd_width]()) * scale_x
	let cy = min_y + row * scale_y
	let c = ComplexSIMD[float_type, simd_width](cx, cy)
	t.data().simd_store[simd_width](
	row * width + col, mandelbrot_kernel_SIMD[simd_width](c)
	)

	# Vectorize the call to compute_vector where call gets a chunk of pixels.
	vectorize[simd_width, compute_vector](width)

	with Runtime() as rt:

	@parameter
	fn bench_parallel[simd_width: Int]():
	parallelize[worker](rt, height, 5 * num_cores())

	alias simd_width = simdwidthof[DType.float64]()
	let parallelized = Benchmark().run[bench_parallel[simd_width]]() / 1e6
	print("Parallelized:", parallelized, "ms")

	try:
	_ = show_plot(t)
	except e:
	print("failed to show plot:", e.value)


	def main():
	parallelized()