Created
August 13, 2023 11:39
-
-
Save exyi/0303e9472e116e325688b8d96f2847cb to your computer and use it in GitHub Desktop.
Memory latency and bogothroughput benchmark
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using BenchmarkTools | |
import Optim | |
function generate_permutation(size::Int)
    # Sattolo's algorithm: yields a uniformly random *cyclic* permutation,
    # i.e. one single cycle covering all `size` elements. (Note the draw is
    # over 1:(i-1), not 1:i as in Fisher–Yates — that restriction is what
    # guarantees the single cycle, so the pointer-chasing walk below visits
    # every entry exactly once.)
    perm = collect(UInt32(1):UInt32(size))
    i = size
    while i >= 2
        j = rand(1:(i - 1))
        perm[i], perm[j] = perm[j], perm[i]
        i -= 1
    end
    return perm
end
function walk_permutation_ib(perm::Vector{UInt32})
    # Pointer-chase through `perm` starting at index 1, following
    # position -> perm[position] until the walk reaches a cell holding 1
    # (the cycle closes). Returns true iff the permutation is one single
    # cycle covering every element. The step cap guards against an
    # infinite loop on a malformed input. This is the latency probe: each
    # hop is a data-dependent load the CPU cannot prefetch.
    cursor = UInt32(1)
    steps = 1
    limit = length(perm) + 10
    @inbounds while perm[cursor] != 1
        cursor = perm[cursor]
        steps += 1
        steps > limit && return false
    end
    return steps == length(perm)
end
# Streaming reduction over the whole array — the throughput probe
# (sequential reads the hardware prefetcher can keep fed, unlike the
# dependent-load walk above).
sum_array(perm::Vector{UInt32}) = sum(perm)
function expected_runtime(size::Integer)
    # Rough expected cost (arbitrary time units) of one pass over an array
    # of `size` 4-byte words, based on which cache level the array fits in.
    # Generalized from `size::Int32` to any `Integer` — the sizes used
    # elsewhere in this file are plain `Int`, which the old signature
    # rejected. Thresholds are expressed in words (capacity ÷ 4) so `size`
    # itself is compared directly, avoiding any `size * 4` overflow for
    # narrow integer types.
    words_l1 = (32 * 1024) ÷ 4        # 32 KiB L1
    words_l2 = (256 * 1024) ÷ 4       # 256 KiB L2
    words_l3 = (6 * 1024 * 1024) ÷ 4  # 6 MiB L3
    if size <= words_l1
        return 1 * size    # L1-resident: ~1 unit per element
    elseif size <= words_l2
        return 4 * size    # L2
    elseif size <= words_l3
        return 30 * size   # L3
    else
        return 100 * size  # main memory
    end
end
# One benchmark measurement for a single array size.
# Fields are now concretely typed (they were untyped, i.e. `Any`, which
# boxes every field and defeats specialization); the types match what the
# constructor call in run_benchmark supplies: `length(...)` counts are Int,
# BenchmarkTools `.time` estimates are Float64 nanoseconds.
struct Result
    size::Int               # number of elements (words) in the tested array
    word_size::Int          # bytes per element (4 for UInt32)
    latency_min::Float64    # min pointer-chase time for the whole walk, ns
    latency_med::Float64    # median pointer-chase time, ns
    latency_std::Float64    # std of pointer-chase samples, ns
    latency_iters::Int      # number of latency benchmark samples
    throughput_min::Float64 # min sum time for the whole array, ns
    throughput_med::Float64 # median sum time, ns
    throughput_std::Float64 # std of sum samples, ns
    throughput_iters::Int   # number of throughput benchmark samples
end
function print_header(file, additional="")
    # Emit the tab-separated column header; column order must match the
    # row order produced by print_result. `additional` lets callers append
    # extra column names (already tab-prefixed).
    columns = [
        "size_num", "size", "latency_median", "throughput_GBps",
        "latency_min", "latency_std", "latency_test_iterations",
        "throughput_median", "throughput_min", "throughput_std",
        "throughput_test_iterations",
    ]
    println(file, join(columns, "\t") * additional)
end
function print_result(file, r::Result, additional="")
    # Write one TSV row for `r`, in the column order print_header declares.
    # Times are reported per element (raw whole-array nanoseconds divided
    # by the element count); throughput is additionally converted to
    # GiB per second from the median whole-array time.
    bytes = r.size * r.word_size
    per_element(t) = round(t / r.size, digits=8)
    gib_per_s = bytes / 1024 / 1024 / 1024 / (r.throughput_med / 1e9)
    fields = [
        "$(r.size)",
        format_bytes(bytes),
        per_element(r.latency_med),
        round(gib_per_s, digits=8),
        per_element(r.latency_min),
        per_element(r.latency_std),
        "$(r.latency_iters)",
        per_element(r.throughput_med),
        per_element(r.throughput_min),
        per_element(r.throughput_std),
        "$(r.throughput_iters)",
    ]
    println(file, join(fields, "\t") * additional)
end
function format_bytes(x)
    # Render a byte count with a binary-unit suffix, switching units at
    # 800 of the previous unit so values stay in a readable range.
    # NOTE(review): the Ki branch rounds with `sigdigits=3` while Mi/Gi use
    # `digits=3` — reproduced as-is to keep the output format identical.
    kib = 1024
    mib = 1024 * kib
    gib = 1024 * mib
    if x < 800
        return "$x B"
    elseif x < 800 * kib
        return "$(round(x / kib, sigdigits=3))Ki"
    elseif x < 800 * mib
        return "$(round(x / mib, digits=3))Mi"
    else
        return "$(round(x / gib, digits=3))Gi"
    end
end
function run_benchmark(sizes, print_debug=true, print_results=true)
    # For each requested array size in bytes, benchmark the pointer-chase
    # walk (latency-bound) and a plain sum (throughput-bound) over a random
    # cyclic permutation, collecting one Result per size.
    # Debug detail goes to stderr; the TSV table goes to stdout.
    results = Result[]
    print_results && print_header(stdout)
    for nbytes in sizes
        print_debug && println(stderr, "Testing array size $(format_bytes(nbytes))...")
        # Sizes are in bytes; the array holds 4-byte words.
        @assert nbytes % 4 == 0
        perm = generate_permutation(nbytes ÷ 4)

        bench_latency = @benchmark walk_permutation_ib($perm)
        if print_debug
            print(stderr, "Latency ")
            show(stderr, MIME("text/plain"), bench_latency)
            println(stderr)
        end

        bench_throughput = @benchmark sum_array($perm)
        if print_debug
            print(stderr, "Throughput ")
            show(stderr, MIME("text/plain"), bench_throughput)
            println(stderr)
        end

        result = Result(
            length(perm), 4,
            minimum(bench_latency).time, median(bench_latency).time,
            BenchmarkTools.std(bench_latency).time, length(bench_latency.times),
            minimum(bench_throughput).time, median(bench_throughput).time,
            BenchmarkTools.std(bench_throughput).time, length(bench_throughput.times),
        )
        print_results && print_result(stdout, result)
        push!(results, result)
    end
    return results
end
function latency_function(size, L1_lat, L2_lat, L3_lat, L4_lat, L1_size, L2_size, L3_size)
    # Piecewise cache model: a random access lands in level k with
    # probability equal to the fraction of the working set that fits in
    # level k but not in a smaller level; expected latency is the
    # probability-weighted mix of the per-level latencies (L4 = RAM).
    frac(cap) = min(1, cap / size)  # fraction of the array within `cap` bytes
    p1 = frac(L1_size)
    p2 = max(0, frac(L2_size) - p1)
    p3 = max(0, frac(L3_size) - p1 - p2)
    p4 = 1 - p1 - p2 - p3
    return p1 * L1_lat + p2 * L2_lat + p3 * L3_lat + p4 * L4_lat
end
# Array sizes to test, in bytes: KiB multiples spanning L1-resident up to
# clearly RAM-resident working sets.
const test_sizes = 1024 .* [
    16, 32, 48, 64, 96, 128, 256, 512, 768, 1024, 2048,
    4 * 1024, 8 * 1024, 12 * 1024, 16 * 1024, 24 * 1024,
    32 * 1024, 64 * 1024, 96 * 1024, 128 * 1024, 192 * 1024,
]

# Run the measurements, then reduce to per-element median latency per size.
const results = run_benchmark(test_sizes)
const latencies = [r.latency_med / r.size for r in results]
println(stderr, "Fitting $latencies to $test_sizes")

# The smallest array is assumed to fit entirely in L1, so its measured
# latency anchors the model; only the remaining parameters are fitted.
const L1_lat = latencies[1]
function optim_objective(x)
    # Loss for fitting the cache model to the measurements. `x` packs the
    # six free parameters: three cache capacities followed by the L2, L3
    # and RAM latencies (L1 latency stays pinned to the smallest
    # measurement, the global L1_lat). The loss is the sum of squared
    # relative errors, mildly up-weighted for the slower (larger) sizes.
    l1_size, l2_size, l3_size, l2_lat, l3_lat, l4_lat = x
    predicted = latency_function.(test_sizes, L1_lat, l2_lat, l3_lat, l4_lat,
                                  l1_size, l2_size, l3_size)
    rel_err = abs.((predicted .- latencies) ./ latencies)
    weights = 3 .+ log.(1 .+ latencies)
    return sum(rel_err .^ 2 .* weights)
end
# Fit all six model parameters (three cache sizes + three latencies),
# box-constrained to within two orders of magnitude of a plausible guess.
initial_args = [16.0 * 1024, 256 * 1024, 30 * 1024 * 1024, 4, 30, 100]
opt_result = Optim.optimize(optim_objective, initial_args ./ 100,
                            initial_args .* 100, initial_args, Optim.Fminbox())
println(stderr, "Optimization result: ", opt_result)
optargs = Optim.minimizer(opt_result)
println(stderr, "Predicted cache sizes: $(format_bytes.(optargs[1:3]))")
println(stderr, "Fitted cache latencies: $([L1_lat; optargs[4:6]])")

# Second fit: pin the cache sizes to assumed hardware values
# (32 KiB L1, 256 KiB L2, 6 MiB L3) and refit only the three latencies,
# warm-starting from the latencies found above.
assumed_cache_sizes = [32.0, 256, 6 * 1024] .* 1024
println(stderr, "Assuming cache sizes: $(format_bytes.(assumed_cache_sizes))")
opt_a_result = Optim.optimize(
    x -> optim_objective([assumed_cache_sizes; x]),
    initial_args[4:6] ./ 100,
    initial_args[4:6] .* 100,
    optargs[4:6],
    Optim.Fminbox(),
)
println(stderr, "Optimization result: ", opt_a_result)
opt_a_args = Optim.minimizer(opt_a_result)
println(stderr, "Fitted cache latencies: $([L1_lat; opt_a_args[1:3]])")
# Write the full result table plus both model predictions to a TSV file.
# Fixed: the extra column names were misspelled "predition1"/"predition2".
open("output.tsv", "w") do f
    print_header(f, "\tprediction1\tprediction2")
    for r in results
        # Prediction from the fully fitted model (cache sizes and latencies free).
        prediction1 = latency_function(r.size * r.word_size, L1_lat,
            optargs[4], optargs[5], optargs[6],
            optargs[1], optargs[2], optargs[3])
        # Prediction with cache sizes fixed to the assumed hardware values.
        prediction2 = latency_function(r.size * r.word_size, L1_lat,
            opt_a_args[1], opt_a_args[2], opt_a_args[3],
            assumed_cache_sizes...)
        print_result(f, r, "\t$prediction1\t$prediction2")
    end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment