Skip to content

Instantly share code, notes, and snippets.

@SimonDanisch
Created September 9, 2023 09:54
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save SimonDanisch/037f4561f08e603f1f4f092d5648019b to your computer and use it in GitHub Desktop.
Save SimonDanisch/037f4561f08e603f1f4f092d5648019b to your computer and use it in GitHub Desktop.
Mojo vs Julia mandelbrot benchmark
# Super simple SIMD implementation for Complex Numbers
struct ComplexSIMD{N,T}
re::NTuple{N,T}
im::NTuple{N,T}
end
Base.abs2(z::ComplexSIMD) = z.re .* z.re .+ z.im .* z.im
Base.:(*)(a::ComplexSIMD, b::ComplexSIMD) = ComplexSIMD(a.re .* b.re .- a.im .* b.im, a.re .* b.im .+ a.im .* b.re)
Base.:(+)(a::ComplexSIMD, b::ComplexSIMD) = ComplexSIMD(a.re .+ b.re, a.im .+ b.im)
function mandelbrot_kernel(c::ComplexSIMD{N,T}) where {N,T}
z = c
mask = ntuple(k -> true, N)
iters = ntuple(k -> T(0), N)
for i in 1:200
!any(mask) && return iters
mask = abs2(z) .<= 4.0
iters = iters .+ (mask .* 1)
z = z * z + c
end
return iters
end
function inner_loop(result, x_range, y_range, y, ::Val{N}) where N
ci = y_range[y]
for x in 1:N:length(x_range)
c = ComplexSIMD(ntuple(k -> x_range[x+k-1], N), ntuple(k -> ci, N))
iters = mandelbrot_kernel(c)
foreach(enumerate(iters)) do (k, v)
# Write the result to the output array
result[y, x+k-1] = v
end
end
end
function mandelbrot(result, x_range, y_range, N)
@sync for y in eachindex(y_range)
Threads.@spawn @inbounds begin
inner_loop(result, x_range, y_range, y, N)
end
end
return result
end
using BenchmarkTools
N = 960
x_range = range(-2.0, 0.6, N)
y_range = range(-1.5, 1.5, N)
result = zeros(N, N)
@btime mandelbrot(result, x_range, y_range, Val(4));
using GLMakie
heatmap(result, colormap=:viridis, colorscale=sqrt)
from benchmark import Benchmark
from complex import ComplexSIMD, ComplexFloat64
from math import iota
from python import Python
from runtime.llcl import num_cores, Runtime
from algorithm import parallelize, vectorize
from tensor import Tensor
from utils.index import Index
alias width = 960
alias height = 960
alias MAX_ITERS = 200
alias min_x = -2.0
alias max_x = 0.6
alias min_y = -1.5
alias max_y = 1.5
alias float_type = DType.float64
alias simd_width = simdwidthof[float_type]()
def show_plot(tensor: Tensor[float_type]):
alias scale = 10
alias dpi = 64
np = Python.import_module("numpy")
plt = Python.import_module("matplotlib.pyplot")
colors = Python.import_module("matplotlib.colors")
numpy_array = np.zeros((height, width), np.float64)
for row in range(height):
for col in range(width):
numpy_array.itemset((col, row), tensor[col, row])
fig = plt.figure(1, [scale, scale * height // width], dpi)
ax = fig.add_axes([0.0, 0.0, 1.0, 1.0], False, 1)
light = colors.LightSource(315, 10, 0, 1, 1, 0)
image = light.shade(
numpy_array, plt.cm.hot, colors.PowerNorm(0.3), "hsv", 0, 0, 1.5
)
plt.imshow(image)
plt.axis("off")
plt.show()
fn mandelbrot_kernel_SIMD[
simd_width: Int
](c: ComplexSIMD[float_type, simd_width]) -> SIMD[float_type, simd_width]:
"""A vectorized implementation of the inner mandelbrot computation."""
var z = ComplexSIMD[float_type, simd_width](0, 0)
var iters = SIMD[float_type, simd_width](0)
var in_set_mask: SIMD[DType.bool, simd_width] = True
for i in range(MAX_ITERS):
if not in_set_mask.reduce_or():
break
in_set_mask = z.squared_norm() <= 4
iters = in_set_mask.select(iters + 1, iters)
z = z.squared_add(c)
return iters
fn parallelized():
let t = Tensor[float_type](height, width)
@parameter
fn worker(row: Int):
let scale_x = (max_x - min_x) / width
let scale_y = (max_y - min_y) / height
@parameter
fn compute_vector[simd_width: Int](col: Int):
"""Each time we oeprate on a `simd_width` vector of pixels."""
let cx = min_x + (col + iota[float_type, simd_width]()) * scale_x
let cy = min_y + row * scale_y
let c = ComplexSIMD[float_type, simd_width](cx, cy)
t.data().simd_store[simd_width](
row * width + col, mandelbrot_kernel_SIMD[simd_width](c)
)
# Vectorize the call to compute_vector where call gets a chunk of pixels.
vectorize[simd_width, compute_vector](width)
with Runtime() as rt:
@parameter
fn bench_parallel[simd_width: Int]():
parallelize[worker](rt, height, 5 * num_cores())
alias simd_width = simdwidthof[DType.float64]()
let parallelized = Benchmark().run[bench_parallel[simd_width]]() / 1e6
print("Parallelized:", parallelized, "ms")
try:
_ = show_plot(t)
except e:
print("failed to show plot:", e.value)
def main():
parallelized()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment