Created
August 13, 2023 11:39
-
-
Save exyi/0303e9472e116e325688b8d96f2847cb to your computer and use it in GitHub Desktop.
Memory latency and bogothroughput benchmark
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using BenchmarkTools | |
import Optim | |
function generate_permutation(size::Int)
    # Sattolo's algorithm: yields a uniformly random *cyclic* permutation,
    # i.e. one single cycle covering all `size` elements. (Note the draw is
    # over 1:(i-1), not 1:i as in Fisher–Yates — that restriction is what
    # guarantees the single cycle, so the pointer-chasing walk below visits
    # every entry exactly once.)
    perm = collect(UInt32(1):UInt32(size))
    i = size
    while i >= 2
        j = rand(1:(i - 1))
        perm[i], perm[j] = perm[j], perm[i]
        i -= 1
    end
    return perm
end
function walk_permutation_ib(perm::Vector{UInt32})
    # Pointer-chase through `perm` starting at index 1, following
    # position -> perm[position] until the walk reaches a cell holding 1
    # (the cycle closes). Returns true iff the permutation is one single
    # cycle covering every element. The step cap guards against an
    # infinite loop on a malformed input. This is the latency probe: each
    # hop is a data-dependent load the CPU cannot prefetch.
    cursor = UInt32(1)
    steps = 1
    limit = length(perm) + 10
    @inbounds while perm[cursor] != 1
        cursor = perm[cursor]
        steps += 1
        steps > limit && return false
    end
    return steps == length(perm)
end
# Streaming reduction over the whole array — the throughput probe
# (sequential reads the hardware prefetcher can keep fed, unlike the
# dependent-load walk above).
sum_array(perm::Vector{UInt32}) = sum(perm)
function expected_runtime(size::Integer)
    # Rough expected cost (arbitrary time units) of one pass over an array
    # of `size` 4-byte words, based on which cache level the array fits in.
    # Generalized from `size::Int32` to any `Integer` — the sizes used
    # elsewhere in this file are plain `Int`, which the old signature
    # rejected. Thresholds are expressed in words (capacity ÷ 4) so `size`
    # itself is compared directly, avoiding any `size * 4` overflow for
    # narrow integer types.
    words_l1 = (32 * 1024) ÷ 4        # 32 KiB L1
    words_l2 = (256 * 1024) ÷ 4       # 256 KiB L2
    words_l3 = (6 * 1024 * 1024) ÷ 4  # 6 MiB L3
    if size <= words_l1
        return 1 * size    # L1-resident: ~1 unit per element
    elseif size <= words_l2
        return 4 * size    # L2
    elseif size <= words_l3
        return 30 * size   # L3
    else
        return 100 * size  # main memory
    end
end
# One benchmark measurement for a single array size.
# Fields are now concretely typed (they were untyped, i.e. `Any`, which
# boxes every field and defeats specialization); the types match what the
# constructor call in run_benchmark supplies: `length(...)` counts are Int,
# BenchmarkTools `.time` estimates are Float64 nanoseconds.
struct Result
    size::Int               # number of elements (words) in the tested array
    word_size::Int          # bytes per element (4 for UInt32)
    latency_min::Float64    # min pointer-chase time for the whole walk, ns
    latency_med::Float64    # median pointer-chase time, ns
    latency_std::Float64    # std of pointer-chase samples, ns
    latency_iters::Int      # number of latency benchmark samples
    throughput_min::Float64 # min sum time for the whole array, ns
    throughput_med::Float64 # median sum time, ns
    throughput_std::Float64 # std of sum samples, ns
    throughput_iters::Int   # number of throughput benchmark samples
end
function print_header(file, additional="")
    # Emit the tab-separated column header; column order must match the
    # row order produced by print_result. `additional` lets callers append
    # extra column names (already tab-prefixed).
    columns = [
        "size_num", "size", "latency_median", "throughput_GBps",
        "latency_min", "latency_std", "latency_test_iterations",
        "throughput_median", "throughput_min", "throughput_std",
        "throughput_test_iterations",
    ]
    println(file, join(columns, "\t") * additional)
end
function print_result(file, r::Result, additional="")
    # Write one TSV row for `r`, in the column order print_header declares.
    # Times are reported per element (raw whole-array nanoseconds divided
    # by the element count); throughput is additionally converted to
    # GiB per second from the median whole-array time.
    bytes = r.size * r.word_size
    per_element(t) = round(t / r.size, digits=8)
    gib_per_s = bytes / 1024 / 1024 / 1024 / (r.throughput_med / 1e9)
    fields = [
        "$(r.size)",
        format_bytes(bytes),
        per_element(r.latency_med),
        round(gib_per_s, digits=8),
        per_element(r.latency_min),
        per_element(r.latency_std),
        "$(r.latency_iters)",
        per_element(r.throughput_med),
        per_element(r.throughput_min),
        per_element(r.throughput_std),
        "$(r.throughput_iters)",
    ]
    println(file, join(fields, "\t") * additional)
end
function format_bytes(x)
    # Render a byte count with a binary-unit suffix, switching units at
    # 800 of the previous unit so values stay in a readable range.
    # NOTE(review): the Ki branch rounds with `sigdigits=3` while Mi/Gi use
    # `digits=3` — reproduced as-is to keep the output format identical.
    kib = 1024
    mib = 1024 * kib
    gib = 1024 * mib
    if x < 800
        return "$x B"
    elseif x < 800 * kib
        return "$(round(x / kib, sigdigits=3))Ki"
    elseif x < 800 * mib
        return "$(round(x / mib, digits=3))Mi"
    else
        return "$(round(x / gib, digits=3))Gi"
    end
end
function run_benchmark(sizes, print_debug=true, print_results=true)
    # For each requested array size in bytes, benchmark the pointer-chase
    # walk (latency-bound) and a plain sum (throughput-bound) over a random
    # cyclic permutation, collecting one Result per size.
    # Debug detail goes to stderr; the TSV table goes to stdout.
    results = Result[]
    print_results && print_header(stdout)
    for nbytes in sizes
        print_debug && println(stderr, "Testing array size $(format_bytes(nbytes))...")
        # Sizes are in bytes; the array holds 4-byte words.
        @assert nbytes % 4 == 0
        perm = generate_permutation(nbytes ÷ 4)

        bench_latency = @benchmark walk_permutation_ib($perm)
        if print_debug
            print(stderr, "Latency ")
            show(stderr, MIME("text/plain"), bench_latency)
            println(stderr)
        end

        bench_throughput = @benchmark sum_array($perm)
        if print_debug
            print(stderr, "Throughput ")
            show(stderr, MIME("text/plain"), bench_throughput)
            println(stderr)
        end

        result = Result(
            length(perm), 4,
            minimum(bench_latency).time, median(bench_latency).time,
            BenchmarkTools.std(bench_latency).time, length(bench_latency.times),
            minimum(bench_throughput).time, median(bench_throughput).time,
            BenchmarkTools.std(bench_throughput).time, length(bench_throughput.times),
        )
        print_results && print_result(stdout, result)
        push!(results, result)
    end
    return results
end
function latency_function(size, L1_lat, L2_lat, L3_lat, L4_lat, L1_size, L2_size, L3_size)
    # Piecewise cache model: a random access lands in level k with
    # probability equal to the fraction of the working set that fits in
    # level k but not in a smaller level; expected latency is the
    # probability-weighted mix of the per-level latencies (L4 = RAM).
    frac(cap) = min(1, cap / size)  # fraction of the array within `cap` bytes
    p1 = frac(L1_size)
    p2 = max(0, frac(L2_size) - p1)
    p3 = max(0, frac(L3_size) - p1 - p2)
    p4 = 1 - p1 - p2 - p3
    return p1 * L1_lat + p2 * L2_lat + p3 * L3_lat + p4 * L4_lat
end
# Array sizes to test, in bytes: KiB multiples spanning L1-resident up to
# clearly RAM-resident working sets.
const test_sizes = 1024 .* [
    16, 32, 48, 64, 96, 128, 256, 512, 768, 1024, 2048,
    4 * 1024, 8 * 1024, 12 * 1024, 16 * 1024, 24 * 1024,
    32 * 1024, 64 * 1024, 96 * 1024, 128 * 1024, 192 * 1024,
]

# Run the measurements, then reduce to per-element median latency per size.
const results = run_benchmark(test_sizes)
const latencies = [r.latency_med / r.size for r in results]
println(stderr, "Fitting $latencies to $test_sizes")

# The smallest array is assumed to fit entirely in L1, so its measured
# latency anchors the model; only the remaining parameters are fitted.
const L1_lat = latencies[1]
function optim_objective(x)
    # Loss for fitting the cache model to the measurements. `x` packs the
    # six free parameters: three cache capacities followed by the L2, L3
    # and RAM latencies (L1 latency stays pinned to the smallest
    # measurement, the global L1_lat). The loss is the sum of squared
    # relative errors, mildly up-weighted for the slower (larger) sizes.
    l1_size, l2_size, l3_size, l2_lat, l3_lat, l4_lat = x
    predicted = latency_function.(test_sizes, L1_lat, l2_lat, l3_lat, l4_lat,
                                  l1_size, l2_size, l3_size)
    rel_err = abs.((predicted .- latencies) ./ latencies)
    weights = 3 .+ log.(1 .+ latencies)
    return sum(rel_err .^ 2 .* weights)
end
# Fit all six model parameters (three cache sizes + three latencies),
# box-constrained to within two orders of magnitude of a plausible guess.
initial_args = [16.0 * 1024, 256 * 1024, 30 * 1024 * 1024, 4, 30, 100]
opt_result = Optim.optimize(optim_objective, initial_args ./ 100,
                            initial_args .* 100, initial_args, Optim.Fminbox())
println(stderr, "Optimization result: ", opt_result)
optargs = Optim.minimizer(opt_result)
println(stderr, "Predicted cache sizes: $(format_bytes.(optargs[1:3]))")
println(stderr, "Fitted cache latencies: $([L1_lat; optargs[4:6]])")

# Second fit: pin the cache sizes to assumed hardware values
# (32 KiB L1, 256 KiB L2, 6 MiB L3) and refit only the three latencies,
# warm-starting from the latencies found above.
assumed_cache_sizes = [32.0, 256, 6 * 1024] .* 1024
println(stderr, "Assuming cache sizes: $(format_bytes.(assumed_cache_sizes))")
opt_a_result = Optim.optimize(
    x -> optim_objective([assumed_cache_sizes; x]),
    initial_args[4:6] ./ 100,
    initial_args[4:6] .* 100,
    optargs[4:6],
    Optim.Fminbox(),
)
println(stderr, "Optimization result: ", opt_a_result)
opt_a_args = Optim.minimizer(opt_a_result)
println(stderr, "Fitted cache latencies: $([L1_lat; opt_a_args[1:3]])")
# Write the full result table plus both model predictions to a TSV file.
# Fixed: the extra column names were misspelled "predition1"/"predition2".
open("output.tsv", "w") do f
    print_header(f, "\tprediction1\tprediction2")
    for r in results
        # Prediction from the fully fitted model (cache sizes and latencies free).
        prediction1 = latency_function(r.size * r.word_size, L1_lat,
            optargs[4], optargs[5], optargs[6],
            optargs[1], optargs[2], optargs[3])
        # Prediction with cache sizes fixed to the assumed hardware values.
        prediction2 = latency_function(r.size * r.word_size, L1_lat,
            opt_a_args[1], opt_a_args[2], opt_a_args[3],
            assumed_cache_sizes...)
        print_result(f, r, "\t$prediction1\t$prediction2")
    end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment