staticfloat/flux_multithreaded_training.jl

## flux_multithreaded_training.jl
using Flux, Printf, Statistics

# Simple model
function gen_model()
    return Chain(
        Dense(8, 8, σ),
        Dense(8, 8, σ),
        Dense(8, 8, σ),
        Dense(8, 1, σ),
    )
end

# Static dataset
function gen_dataset(batch_size = 128, num_minibatches = 256)
    # Return an array of (x, y) tuples
    return [(
        randn(8, batch_size),
        randn(1, batch_size),
    ) for _ in 1:num_minibatches]
end

# Helper function to print some `@timed` stats
function info_stats(msg, stats, num_epochs)
    @info(
        msg,
        time=stats.time,
        time_per_epoch=stats.time/num_epochs,
        gc=@sprintf("%.1f%%", stats.gctime*100.0/stats.time),
        allocated=Base.format_bytes(stats.bytes),
    )
end

# Warm up codegen
@info("Warming up Flux.train!()")
begin
    model = gen_model()
    dataset = gen_dataset()
    opt = Flux.Optimise.ADAM(1e-4)
    num_epochs = 20
    stats = @timed begin
        for idx in 1:num_epochs
            Flux.train!(
                (x, y) -> Flux.Losses.mse(model(x), y),
                Flux.params(model),
                dataset,
                opt,
            )
        end
    end
    info_stats("First warm-up completed", stats, num_epochs)

    stats = @timed begin
        for idx in 1:4
            Flux.train!(
                (x, y) -> Flux.Losses.mse(model(x), y),
                Flux.params(model),
                dataset,
                opt,
            )
        end
    end
    info_stats("Second warm-up completed", stats, num_epochs)
end


# Train `num_models` in parallel, each on its own thread
num_models = 32
datasets = [gen_dataset() for _ in 1:num_models]
models = [gen_model() for _ in 1:num_models]
training_stats = [Any[] for _ in 1:num_models]

@warn("Beginning training with $(Threads.nthreads()) threads")
Threads.@threads for model_idx in 1:num_models
    model = models[model_idx]
    dataset = datasets[model_idx]
    opt = Flux.Optimise.ADAM(1e-4)

    # Threads will die off one by one, so we can see how reducing the number of threads
    # eases off GC pressure over the whole cohort
    for idx in 1:(model_idx*num_epochs)
        push!(training_stats[model_idx], @timed begin
            Flux.train!(
                (x, y) -> Flux.Losses.mse(model(x), y),
                Flux.params(model),
                dataset,
                opt,
            )
        end)
    end

    # Calculate mean statistics over the tail end of this thread's lifetime
    tail_stats = training_stats[model_idx][end-num_epochs+1:end]
    mean_stats = (;
        time = mean(s.time for s in tail_stats),
        gctime = mean(s.gctime for s in tail_stats),
        bytes = mean(s.bytes for s in tail_stats),
    )
    info_stats("Finished model $(model_idx)", mean_stats, model_idx*num_epochs)
end

# Create nice graph at the end of moving-average GC time %
@info("Finished training")
using CairoMakie
fig = Figure()
ax = Axis(fig[1,1])
for model_idx in 10:num_models
    filt_len = 2*num_epochs
    gctimes = [s.gctime for s in training_stats[model_idx]]
    gctimes_filt = conv(gctimes[:,:,:], ones(filt_len,1,1)./filt_len)[:]
    times = [s.time for s in training_stats[model_idx]]
    times_filt = conv(times[:,:,:], ones(filt_len,1,1)./filt_len)[:]

    lines!(ax, gctimes_filt.*100.0./times_filt)
    #lines!(ax, times_filt .- gctimes_filt)
end
ax.title = "average GC time % by epoch"
save("training_stats.png", fig)

## output.log
$ julia --project --threads=auto -i flux_multithreaded_training.jl
[ Info: Warming up Flux.train!()
┌ Info: First warm-up completed
│   time = 24.151648601
│   time_per_epoch = 1.20758243005
│   gc = "6.6%"
└   allocated = "4.288 GiB"
┌ Info: Second warm-up completed
│   time = 0.292563925
│   time_per_epoch = 0.01462819625
│   gc = "12.1%"
└   allocated = "188.549 MiB"
┌ Warning: Beginning training with 64 threads
└ @ Main ~/src/surrogate_testing/flux_multithreaded_training.jl:67
┌ Info: Finished model 1
│   time = 0.96046645575
│   time_per_epoch = 0.0480233227875
│   gc = "62.2%"
└   allocated = "1.397 GiB"
┌ Info: Finished model 2
│   time = 0.6822931502
│   time_per_epoch = 0.017057328755
│   gc = "72.6%"
└   allocated = "1.361 GiB"
┌ Info: Finished model 3
│   time = 0.63013394675
│   time_per_epoch = 0.010502232445833334
│   gc = "71.2%"
└   allocated = "1.303 GiB"
┌ Info: Finished model 4
│   time = 0.6284951171
│   time_per_epoch = 0.00785618896375
│   gc = "69.2%"
└   allocated = "1.258 GiB"
┌ Info: Finished model 5
│   time = 0.5810092911
│   time_per_epoch = 0.005810092911
│   gc = "71.5%"
└   allocated = "1.288 GiB"
┌ Info: Finished model 6
│   time = 0.5578616940000001
│   time_per_epoch = 0.004648847450000001
│   gc = "68.6%"
└   allocated = "1.197 GiB"
┌ Info: Finished model 7
│   time = 0.5249345326
│   time_per_epoch = 0.003749532375714286
│   gc = "67.2%"
└   allocated = "1.146 GiB"
┌ Info: Finished model 8
│   time = 0.4802209392000001
│   time_per_epoch = 0.0030013808700000005
│   gc = "70.2%"
└   allocated = "1.100 GiB"
┌ Info: Finished model 9
│   time = 0.4712065032
│   time_per_epoch = 0.0026178139066666667
│   gc = "66.0%"
└   allocated = "1.059 GiB"
┌ Info: Finished model 10
│   time = 0.46222509810000006
│   time_per_epoch = 0.0023111254905000002
│   gc = "67.5%"
└   allocated = "1.037 GiB"
┌ Info: Finished model 11
│   time = 0.41497993315
│   time_per_epoch = 0.0018862724234090908
│   gc = "65.9%"
└   allocated = "996.651 MiB"
┌ Info: Finished model 12
│   time = 0.39126343464999996
│   time_per_epoch = 0.0016302643110416666
│   gc = "64.4%"
└   allocated = "950.995 MiB"
┌ Info: Finished model 13
│   time = 0.382742247
│   time_per_epoch = 0.0014720855653846154
│   gc = "64.0%"
└   allocated = "916.340 MiB"
┌ Info: Finished model 14
│   time = 0.34288418504999996
│   time_per_epoch = 0.0012245863751785713
│   gc = "64.1%"
└   allocated = "852.925 MiB"
┌ Info: Finished model 15
│   time = 0.29296357749999996
│   time_per_epoch = 0.0009765452583333332
│   gc = "62.7%"
└   allocated = "791.917 MiB"
┌ Info: Finished model 16
│   time = 0.27802489244999995
│   time_per_epoch = 0.0008688277889062499
│   gc = "63.1%"
└   allocated = "747.786 MiB"
┌ Info: Finished model 17
│   time = 0.27814960005
│   time_per_epoch = 0.0008180870589705883
│   gc = "62.4%"
└   allocated = "774.939 MiB"
┌ Info: Finished model 18
│   time = 0.22394830665
│   time_per_epoch = 0.0006220786295833334
│   gc = "60.7%"
└   allocated = "663.826 MiB"
┌ Info: Finished model 19
│   time = 0.23692811540000003
│   time_per_epoch = 0.0006234950405263159
│   gc = "60.3%"
└   allocated = "657.192 MiB"
┌ Info: Finished model 20
│   time = 0.1871665368
│   time_per_epoch = 0.000467916342
│   gc = "58.2%"
└   allocated = "583.980 MiB"
┌ Info: Finished model 21
│   time = 0.16696599799999998
│   time_per_epoch = 0.0003975380904761904
│   gc = "60.0%"
└   allocated = "573.366 MiB"
┌ Info: Finished model 22
│   time = 0.1810929687
│   time_per_epoch = 0.00041157492886363634
│   gc = "57.9%"
└   allocated = "555.059 MiB"
┌ Info: Finished model 24
│   time = 0.134708169
│   time_per_epoch = 0.00028064201875
│   gc = "55.2%"
└   allocated = "438.462 MiB"
┌ Info: Finished model 23
│   time = 0.14440032309999998
│   time_per_epoch = 0.0003139137458695652
│   gc = "53.9%"
└   allocated = "469.888 MiB"
┌ Info: Finished model 25
│   time = 0.11250617895
│   time_per_epoch = 0.0002250123579
│   gc = "53.4%"
└   allocated = "350.242 MiB"
┌ Info: Finished model 26
│   time = 0.09209255605
│   time_per_epoch = 0.00017710106932692307
│   gc = "46.9%"
└   allocated = "283.863 MiB"
┌ Info: Finished model 28
│   time = 0.08827645390000001
│   time_per_epoch = 0.0001576365248214286
│   gc = "43.5%"
└   allocated = "250.151 MiB"
┌ Info: Finished model 29
│   time = 0.07947673274999997
│   time_per_epoch = 0.00013702884956896547
│   gc = "38.1%"
└   allocated = "220.892 MiB"
┌ Info: Finished model 27
│   time = 0.09358652614999999
│   time_per_epoch = 0.00017330838175925924
│   gc = "38.2%"
└   allocated = "248.892 MiB"
┌ Info: Finished model 31
│   time = 0.06465937009999999
│   time_per_epoch = 0.0001042893066129032
│   gc = "28.0%"
└   allocated = "126.471 MiB"
┌ Info: Finished model 30
│   time = 0.07568235385
│   time_per_epoch = 0.00012613725641666667
│   gc = "21.4%"
└   allocated = "106.260 MiB"
┌ Info: Finished model 32
│   time = 0.041342464450000005
│   time_per_epoch = 6.459760070312501e-5
│   gc = "17.9%"
└   allocated = "45.388 MiB"
[ Info: Finished training
	using Flux, Printf, Statistics

	# Simple model
	function gen_model()
	return Chain(
	Dense(8, 8, σ),
	Dense(8, 8, σ),
	Dense(8, 8, σ),
	Dense(8, 1, σ),
	)
	end

	# Static dataset
	function gen_dataset(batch_size = 128, num_minibatches = 256)
	# Return an array of (x, y) tuples
	return [(
	randn(8, batch_size),
	randn(1, batch_size),
	) for _ in 1:num_minibatches]
	end

	# Helper function to print some `@timed` stats
	function info_stats(msg, stats, num_epochs)
	@info(
	msg,
	time=stats.time,
	time_per_epoch=stats.time/num_epochs,
	gc=@sprintf("%.1f%%", stats.gctime*100.0/stats.time),
	allocated=Base.format_bytes(stats.bytes),
	)
	end

	# Warm up codegen
	@info("Warming up Flux.train!()")
	begin
	model = gen_model()
	dataset = gen_dataset()
	opt = Flux.Optimise.ADAM(1e-4)
	num_epochs = 20
	stats = @timed begin
	for idx in 1:num_epochs
	Flux.train!(
	(x, y) -> Flux.Losses.mse(model(x), y),
	Flux.params(model),
	dataset,
	opt,
	)
	end
	end
	info_stats("First warm-up completed", stats, num_epochs)

	stats = @timed begin
	for idx in 1:4
	Flux.train!(
	(x, y) -> Flux.Losses.mse(model(x), y),
	Flux.params(model),
	dataset,
	opt,
	)
	end
	end
	info_stats("Second warm-up completed", stats, num_epochs)
	end


	# Train `num_models` in parallel, each on its own thread
	num_models = 32
	datasets = [gen_dataset() for _ in 1:num_models]
	models = [gen_model() for _ in 1:num_models]
	training_stats = [Any[] for _ in 1:num_models]

	@warn("Beginning training with $(Threads.nthreads()) threads")
	Threads.@threads for model_idx in 1:num_models
	model = models[model_idx]
	dataset = datasets[model_idx]
	opt = Flux.Optimise.ADAM(1e-4)

	# Threads will die off one by one, so we can see how reducing the number of threads
	# eases off GC pressure over the whole cohort
	for idx in 1:(model_idx*num_epochs)
	push!(training_stats[model_idx], @timed begin
	Flux.train!(
	(x, y) -> Flux.Losses.mse(model(x), y),
	Flux.params(model),
	dataset,
	opt,
	)
	end)
	end

	# Calculate mean statistics over the tail end of this thread's lifetime
	tail_stats = training_stats[model_idx][end-num_epochs+1:end]
	mean_stats = (;
	time = mean(s.time for s in tail_stats),
	gctime = mean(s.gctime for s in tail_stats),
	bytes = mean(s.bytes for s in tail_stats),
	)
	info_stats("Finished model $(model_idx)", mean_stats, model_idx*num_epochs)
	end

	# Create nice graph at the end of moving-average GC time %
	@info("Finished training")
	using CairoMakie
	fig = Figure()
	ax = Axis(fig[1,1])
	for model_idx in 10:num_models
	filt_len = 2*num_epochs
	gctimes = [s.gctime for s in training_stats[model_idx]]
	gctimes_filt = conv(gctimes[:,:,:], ones(filt_len,1,1)./filt_len)[:]
	times = [s.time for s in training_stats[model_idx]]
	times_filt = conv(times[:,:,:], ones(filt_len,1,1)./filt_len)[:]

	lines!(ax, gctimes_filt.*100.0./times_filt)
	#lines!(ax, times_filt .- gctimes_filt)
	end
	ax.title = "average GC time % by epoch"
	save("training_stats.png", fig)
	$ julia --project --threads=auto -i flux_multithreaded_training.jl
	[ Info: Warming up Flux.train!()
	┌ Info: First warm-up completed
	│ time = 24.151648601
	│ time_per_epoch = 1.20758243005
	│ gc = "6.6%"
	└ allocated = "4.288 GiB"
	┌ Info: Second warm-up completed
	│ time = 0.292563925
	│ time_per_epoch = 0.01462819625
	│ gc = "12.1%"
	└ allocated = "188.549 MiB"
	┌ Warning: Beginning training with 64 threads
	└ @ Main ~/src/surrogate_testing/flux_multithreaded_training.jl:67
	┌ Info: Finished model 1
	│ time = 0.96046645575
	│ time_per_epoch = 0.0480233227875
	│ gc = "62.2%"
	└ allocated = "1.397 GiB"
	┌ Info: Finished model 2
	│ time = 0.6822931502
	│ time_per_epoch = 0.017057328755
	│ gc = "72.6%"
	└ allocated = "1.361 GiB"
	┌ Info: Finished model 3
	│ time = 0.63013394675
	│ time_per_epoch = 0.010502232445833334
	│ gc = "71.2%"
	└ allocated = "1.303 GiB"
	┌ Info: Finished model 4
	│ time = 0.6284951171
	│ time_per_epoch = 0.00785618896375
	│ gc = "69.2%"
	└ allocated = "1.258 GiB"
	┌ Info: Finished model 5
	│ time = 0.5810092911
	│ time_per_epoch = 0.005810092911
	│ gc = "71.5%"
	└ allocated = "1.288 GiB"
	┌ Info: Finished model 6
	│ time = 0.5578616940000001
	│ time_per_epoch = 0.004648847450000001
	│ gc = "68.6%"
	└ allocated = "1.197 GiB"
	┌ Info: Finished model 7
	│ time = 0.5249345326
	│ time_per_epoch = 0.003749532375714286
	│ gc = "67.2%"
	└ allocated = "1.146 GiB"
	┌ Info: Finished model 8
	│ time = 0.4802209392000001
	│ time_per_epoch = 0.0030013808700000005
	│ gc = "70.2%"
	└ allocated = "1.100 GiB"
	┌ Info: Finished model 9
	│ time = 0.4712065032
	│ time_per_epoch = 0.0026178139066666667
	│ gc = "66.0%"
	└ allocated = "1.059 GiB"
	┌ Info: Finished model 10
	│ time = 0.46222509810000006
	│ time_per_epoch = 0.0023111254905000002
	│ gc = "67.5%"
	└ allocated = "1.037 GiB"
	┌ Info: Finished model 11
	│ time = 0.41497993315
	│ time_per_epoch = 0.0018862724234090908
	│ gc = "65.9%"
	└ allocated = "996.651 MiB"
	┌ Info: Finished model 12
	│ time = 0.39126343464999996
	│ time_per_epoch = 0.0016302643110416666
	│ gc = "64.4%"
	└ allocated = "950.995 MiB"
	┌ Info: Finished model 13
	│ time = 0.382742247
	│ time_per_epoch = 0.0014720855653846154
	│ gc = "64.0%"
	└ allocated = "916.340 MiB"
	┌ Info: Finished model 14
	│ time = 0.34288418504999996
	│ time_per_epoch = 0.0012245863751785713
	│ gc = "64.1%"
	└ allocated = "852.925 MiB"
	┌ Info: Finished model 15
	│ time = 0.29296357749999996
	│ time_per_epoch = 0.0009765452583333332
	│ gc = "62.7%"
	└ allocated = "791.917 MiB"
	┌ Info: Finished model 16
	│ time = 0.27802489244999995
	│ time_per_epoch = 0.0008688277889062499
	│ gc = "63.1%"
	└ allocated = "747.786 MiB"
	┌ Info: Finished model 17
	│ time = 0.27814960005
	│ time_per_epoch = 0.0008180870589705883
	│ gc = "62.4%"
	└ allocated = "774.939 MiB"
	┌ Info: Finished model 18
	│ time = 0.22394830665
	│ time_per_epoch = 0.0006220786295833334
	│ gc = "60.7%"
	└ allocated = "663.826 MiB"
	┌ Info: Finished model 19
	│ time = 0.23692811540000003
	│ time_per_epoch = 0.0006234950405263159
	│ gc = "60.3%"
	└ allocated = "657.192 MiB"
	┌ Info: Finished model 20
	│ time = 0.1871665368
	│ time_per_epoch = 0.000467916342
	│ gc = "58.2%"
	└ allocated = "583.980 MiB"
	┌ Info: Finished model 21
	│ time = 0.16696599799999998
	│ time_per_epoch = 0.0003975380904761904
	│ gc = "60.0%"
	└ allocated = "573.366 MiB"
	┌ Info: Finished model 22
	│ time = 0.1810929687
	│ time_per_epoch = 0.00041157492886363634
	│ gc = "57.9%"
	└ allocated = "555.059 MiB"
	┌ Info: Finished model 24
	│ time = 0.134708169
	│ time_per_epoch = 0.00028064201875
	│ gc = "55.2%"
	└ allocated = "438.462 MiB"
	┌ Info: Finished model 23
	│ time = 0.14440032309999998
	│ time_per_epoch = 0.0003139137458695652
	│ gc = "53.9%"
	└ allocated = "469.888 MiB"
	┌ Info: Finished model 25
	│ time = 0.11250617895
	│ time_per_epoch = 0.0002250123579
	│ gc = "53.4%"
	└ allocated = "350.242 MiB"
	┌ Info: Finished model 26
	│ time = 0.09209255605
	│ time_per_epoch = 0.00017710106932692307
	│ gc = "46.9%"
	└ allocated = "283.863 MiB"
	┌ Info: Finished model 28
	│ time = 0.08827645390000001
	│ time_per_epoch = 0.0001576365248214286
	│ gc = "43.5%"
	└ allocated = "250.151 MiB"
	┌ Info: Finished model 29
	│ time = 0.07947673274999997
	│ time_per_epoch = 0.00013702884956896547
	│ gc = "38.1%"
	└ allocated = "220.892 MiB"
	┌ Info: Finished model 27
	│ time = 0.09358652614999999
	│ time_per_epoch = 0.00017330838175925924
	│ gc = "38.2%"
	└ allocated = "248.892 MiB"
	┌ Info: Finished model 31
	│ time = 0.06465937009999999
	│ time_per_epoch = 0.0001042893066129032
	│ gc = "28.0%"
	└ allocated = "126.471 MiB"
	┌ Info: Finished model 30
	│ time = 0.07568235385
	│ time_per_epoch = 0.00012613725641666667
	│ gc = "21.4%"
	└ allocated = "106.260 MiB"
	┌ Info: Finished model 32
	│ time = 0.041342464450000005
	│ time_per_epoch = 6.459760070312501e-5
	│ gc = "17.9%"
	└ allocated = "45.388 MiB"
	[ Info: Finished training