Skip to content

Instantly share code, notes, and snippets.

@mdmaas
Last active September 29, 2023 14:00
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save mdmaas/d1b6b1a69a6b235143d7110237ff4ae8 to your computer and use it in GitHub Desktop.
Save mdmaas/d1b6b1a69a6b235143d7110237ff4ae8 to your computer and use it in GitHub Desktop.
Simple performance benchmark of Julia custom array types
using LoopVectorization
using Bumper
using StrideArraysCore
using StaticTools
# Configure the default bump-allocator buffer before any `@no_escape` block runs.
# NOTE(review): presumably this is Bumper's default buffer and the argument is its size in
# bytes; the largest array requested below is 99 Float64s (792 bytes) — confirm against
# the installed Bumper version.
set_default_buffer_size!(1000)
"""
    sumArray_alloc(N)

Allocate a new `Array{Float64}` of length `N`, store `1 / i^2` in entry `i` for
`i = 1, …, N`, and return the sum of the entries as a `Float64`.
"""
@inline function sumArray_alloc(N)
    buf = Array{Float64}(undef, N)
    # First pass: write the terms of the series into the scratch array.
    @turbo for k in 1:N
        buf[k] = 1.0 / k^2
    end
    # Second pass: reduce the stored terms.
    total = 0.0
    @turbo for k in 1:N
        total += buf[k]
    end
    return total
end
"""
    sumArray_malloc(N)

Same computation as `sumArray_alloc`, but the scratch storage is a
`MallocArray{Float64}` of length `N` that is explicitly released with `free`
before the sum is returned.
"""
@inline function sumArray_malloc(N)
    buf = MallocArray{Float64}(undef, N)
    # First pass: write the terms of the series into the scratch array.
    @turbo for k in 1:N
        buf[k] = 1.0 / k^2
    end
    # Second pass: reduce the stored terms.
    total = 0.0
    @turbo for k in 1:N
        total += buf[k]
    end
    # The array is manually managed, so it has to be released by hand.
    free(buf)
    return total
end
"""
    sumArray_bumper(N)

Same computation as `sumArray_alloc`, but the scratch array of `N` `Float64`s is
obtained with `alloc` inside a `@no_escape` block. Only the scalar sum is used
after the block.
"""
@inline function sumArray_bumper(N)
    # NOTE(review): presumably the memory handed out by `alloc` is reclaimed when the
    # `@no_escape` block ends — confirm with the Bumper documentation.
    @no_escape begin
        buf = alloc(Float64, N)
        # First pass: write the terms of the series into the scratch array.
        @turbo for k in 1:N
            buf[k] = 1.0 / k^2
        end
        # Second pass: reduce the stored terms.
        total = 0.0
        @turbo for k in 1:N
            total += buf[k]
        end
    end
    return total
end
"""
    sumArray_prealloc(N, smallarray)

Same computation as `sumArray_alloc`, but using caller-provided storage:
overwrite `smallarray[i]` with `1 / i^2` for `i = 1, …, N`, then return the sum
of those `N` entries. Nothing is allocated here; `smallarray` is modified.
"""
@inline function sumArray_prealloc(N, smallarray)
    # First pass: write the terms of the series into the caller's array.
    @turbo for k in 1:N
        smallarray[k] = 1.0 / k^2
    end
    # Second pass: reduce the stored terms.
    total = 0.0
    @turbo for k in 1:N
        total += smallarray[k]
    end
    return total
end
"""
    test_alloc(N)

Benchmark driver: call `sumArray_alloc(N)` 10000 times in a row. The sums are
discarded and the function returns `nothing`.
"""
function test_alloc(N)
    repetitions = 10000
    last_sum = 0.0
    for _ in 1:repetitions
        last_sum = sumArray_alloc(N)
    end
    return nothing
end
"""
    test_bumper(N)

Benchmark driver: call `sumArray_bumper(N)` 10000 times in a row. The sums are
discarded and the function returns `nothing`.
"""
function test_bumper(N)
    repetitions = 10000
    last_sum = 0.0
    for _ in 1:repetitions
        last_sum = sumArray_bumper(N)
    end
    return nothing
end
"""
    test_malloc(N)

Benchmark driver: call `sumArray_malloc(N)` 10000 times in a row. The sums are
discarded and the function returns `nothing`.
"""
function test_malloc(N)
    repetitions = 10000
    last_sum = 0.0
    for _ in 1:repetitions
        last_sum = sumArray_malloc(N)
    end
    return nothing
end
"""
    test_prealloc(N)

Benchmark driver: allocate one `Array{Float64}` of length `N` up front, then
call `sumArray_prealloc(N, buffer)` 10000 times on that same array. The sums are
discarded and the function returns `nothing`.
"""
function test_prealloc(N)
    repetitions = 10000
    # Allocated once, outside the loop, and reused by every repetition.
    buffer = Array{Float64}(undef, N)
    last_sum = 0.0
    for _ in 1:repetitions
        last_sum = sumArray_prealloc(N, buffer)
    end
    return nothing
end
using Libdl
# C reference kernels, kept as a string so they can be piped to the C compiler.
# Both functions store the terms 1/(k+1)^2 for k = 0, …, N-1 in a temporary array and
# return the sum of those N terms:
#   * `sumCStackArray` uses a variable-length array declared inside the function;
#   * `sumCHeapArray` uses a `malloc`ed buffer that it `free`s before returning.
C_code = """
#include <stdlib.h>
#include <math.h>
#include <omp.h>
double sumCStackArray( int N ) {
double smallarray[N];
for(unsigned int k = 0; k<N; k++){
smallarray[k] = 1.0/pow(k+1,2);
}
double sum = 0.0;
#pragma omp simd reduction(+:sum)
for(unsigned int k = 0; k<N; k++){
sum += smallarray[k];
}
return sum;
}
double sumCHeapArray( int N ) {
double * smallarray = malloc(N * sizeof(double));
for(unsigned int k = 0; k<N; k++){
smallarray[k] = 1.0/pow(k+1,2);
}
double sum = 0.0;
#pragma omp simd reduction(+:sum)
for(unsigned int k = 0; k<N; k++){
sum += smallarray[k];
}
free(smallarray);
return sum;
}
"""
Clib = "libarray"

# Relative path of the shared library built below. It carries the platform's
# shared-library extension (`Libdl.dlext`), and the very same constant is used both as
# the compiler's output file and as the library the `@ccall` wrappers load, so the two
# can no longer disagree (the wrappers used to hard-code ".so", which only matches where
# `Libdl.dlext == "so"`). Declared `const` so it can serve as the library name in
# `@ccall`.
const Clib_path = "./" * Clib * "." * Libdl.dlext

# Build the library: the C source is written to gcc's stdin (`-xc -` makes gcc read C
# code from standard input). The `do` form of `open` waits for gcc to finish.
open(`gcc -fPIC -O3 -fargument-noalias -fopenmp -xc -shared -o $Clib_path -`, "w") do io
    print(io, C_code)
end

# Thin wrappers around the two C kernels; `N` is converted to a C `int` and the result
# comes back as a C `double`.
sumCStackArray(N) = @ccall Clib_path.sumCStackArray(N::Cint)::Cdouble
sumCHeapArray(N) = @ccall Clib_path.sumCHeapArray(N::Cint)::Cdouble
"""
    test_cstackarray(N)

Benchmark driver: call the C kernel wrapper `sumCStackArray(N)` 10000 times in a
row. The sums are discarded and the function returns `nothing`.
"""
function test_cstackarray(N)
    repetitions = 10000
    last_sum = 0.0
    for _ in 1:repetitions
        last_sum = sumCStackArray(N)
    end
    return nothing
end
"""
    test_cheaparray(N)

Benchmark driver: call the C kernel wrapper `sumCHeapArray(N)` 10000 times in a
row. The sums are discarded and the function returns `nothing`.
"""
function test_cheaparray(N)
    repetitions = 10000
    last_sum = 0.0
    for _ in 1:repetitions
        last_sum = sumCHeapArray(N)
    end
    return nothing
end
# Wrapper for a C function `timesumCArray(int, int)` returning `double`, looked up in
# "./libarray.so".
# NOTE(review): the `C_code` string in this file defines only `sumCStackArray` and
# `sumCHeapArray`; no `timesumCArray` appears there, so calling this will presumably
# fail at symbol lookup — confirm the symbol exists before using it.
function time_sumCArray(N, REP)
    return ccall((:timesumCArray, "./libarray.so"), Cdouble, (Cint, Cint), N, REP)
end

# Convenience form of `time_sumCArray` with the repetition count fixed at 10000.
function test_c_timing(N)
    return time_sumCArray(N, 10000)
end
# Array lengths to benchmark.
Ns = 5:2:100

"""
    elapsed_us(f, Ns)

Return a `Vector{Float64}` holding, for each `N` in `Ns`, the wall-clock time of one
call `f(N)` in microseconds.

`f` is called once, untimed, before measuring so that the first sample does not include
the JIT compilation of `f`.
"""
function elapsed_us(f, Ns)
    isempty(Ns) && return Float64[]
    f(first(Ns))  # warm-up: force compilation outside the timed region
    return [(@elapsed f(N)) * 1e6 for N in Ns]
end

"""
    elapsed_us_if_defined(name, Ns)

Like `elapsed_us`, but the benchmark driver is looked up by `name` in this module.
When no such binding exists, emit a warning and return a vector of `NaN`s of the same
length as `Ns` instead of aborting the whole script with an `UndefVarError`.
"""
function elapsed_us_if_defined(name::Symbol, Ns)
    mod = @__MODULE__
    if isdefined(mod, name)
        return elapsed_us(getfield(mod, name), Ns)
    end
    @warn "Benchmark driver is not defined; reporting NaN timings" name
    return fill(NaN, length(Ns))
end

# `test_cheat` and `test_pre_malloc` are not defined in this file, so they are timed
# only when something else has defined them beforehand.
t_cheat = elapsed_us_if_defined(:test_cheat, Ns)
t_alloc = elapsed_us(test_alloc, Ns)
t_malloc = elapsed_us(test_malloc, Ns)
t_bumper = elapsed_us(test_bumper, Ns)
t_prealloc = elapsed_us(test_prealloc, Ns)
t_pre_malloc = elapsed_us_if_defined(:test_pre_malloc, Ns)
t_cstack = elapsed_us(test_cstackarray, Ns)
t_cheap = elapsed_us(test_cheaparray, Ns)
"""
    mean(x)

Return the arithmetic mean of the collection `x`, computed as `sum(x) / length(x)`.
"""
function mean(x)
    total = sum(x)
    return total / length(x)
end
using Plots
# Select the GR plotting backend.
gr()

# Every curve is a timing series divided element-wise by the C stack-array timings.
# Disabled series, kept for reference:
# plot(Ns, t_alloc./t_cstack, label="Julia Arrays")
plot(Ns, t_prealloc ./ t_cstack; label="Julia Pre-allocated Arrays")
for (timings, series_label) in (
    (t_malloc, "Julia MallocArrays"),
    (t_pre_malloc, "Julia Pre-MallocArrays"),
    (t_bumper, "Bumper+StrideArrays"),
    (t_cheap, "C Heap Array"),
)
    plot!(Ns, timings ./ t_cstack; label=series_label)
end
plot!(; xlabel="N", ylabel="Cost wrt C stack-allocation")
# Constant reference line at 1: the baseline itself.
plot!(Ns, ones(size(Ns)); color=:black, label="C Stack Arrays")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment