mcabbott/layer_norm.jl Secret

## layer_norm.jl

##### Some code I had for normalisation

using ChainRulesCore, Statistics

# Baseline normalisation along the first dimension, like what LayerNorm does in Flux:
# subtract the mean, then divide every element by sqrt(variance + eps).
# The variance is uncorrected (divides by N) and reuses the mean computed first.
function normal_now(x::AbstractArray)
    centre = mean(x; dims=1)
    spread = var(x; dims=1, mean=centre, corrected=false)
    tiny = eps(eltype(x))
    return (x .- centre) ./ sqrt.(spread .+ tiny)
end

# Improved version of `normal_now`: the inverse standard deviation is computed once
# per reduced slice, so the big broadcast is only a subtract and a multiply.
function normal_new(x::AbstractArray)
    centre = mean(x; dims=1)
    spread = var(x; dims=1, mean=centre, corrected=false)
    # The only idea here: don't evaluate sqrt for every element of x.
    rstd = inv.(sqrt.(spread .+ eps(eltype(x))))
    # Alternative that was quicker on GPU:
    #   y = @fastmath (x .- μ) ./ sqrt.(σ2 .+ ε)
    return (x .- centre) .* rstd
end

# Mostly I thought a bit about how to write a fused gradient here:

# Fused reverse rule for `normal_new`. The forward pass is repeated here so that the
# pullback can close over the mean, the inverse std and the output `y`.
function ChainRulesCore.rrule(::typeof(normal_new), x::AbstractArray)
    centre = mean(x; dims=1)
    spread = var(x; dims=1, mean=centre, corrected=false)
    rstd = inv.(sqrt.(spread .+ eps(eltype(x))))
    y = (x .- centre) .* rstd
    # y = @fastmath (x .- μ) ./ sqrt.(σ2 .+ ε)  # quicker on GPU
    inv_len = one(eltype(y)) / size(x, 1)
    function back(Δ)
        dy = unthunk(Δ)
        # Textbook form:
        #   dx = istd .* (dy .- mean(dy; dims=1) .- y .* istd .* mean(dy .* (x .- μ); dims=1))
        # Written with sums, this used 38% of the memory:
        #   dx = istd .* (dy .- iN .* sum(dy; dims=1) .- y .* iN .* istd .* sum(dy .* (x .- μ); dims=1))
        # Re-using one buffer for both dy .* (x .- μ) and the result used 25%.
        # Should be safe in a jacobian.
        buf = dy .* (x .- centre)
        weighted = sum(buf; dims=1)  # reduced before `buf` is overwritten below
        buf .= rstd .* (dy .- inv_len .* sum(dy; dims=1) .- y .* inv_len .* rstd .* weighted)
        return (NoTangent(), buf)
    end
    return y, back
end

##### Newer version...

using GPUArraysCore, Statistics, ChainRulesCore

# Generic path: mean and variance of `A` over `dims` via Statistics, returned as the
# named tuple `(; mean, var)`. The variance reuses the mean that was just computed.
function mean_var(A::AbstractArray; dims, corrected::Bool=false)
    centre = mean(A; dims=dims)
    spread = var(A; dims=dims, corrected=corrected, mean=centre)
    return (mean=centre, var=spread)
end

# This is something like Welford's algorithm. It's pretty slow on CPU.
# Single-pass mean and variance for Float32 GPU arrays, done as one `mapreduce` over
# (mean, scaled variance, count) triples. Returns the named tuple `(; mean, var)`.
# Note that the locals `mean` and `var` below shadow the Statistics functions inside
# this method; neither function is called here.
function mean_var(A::AbstractGPUArray{Float32}; dims, corrected::Bool=false)
    # Triple for "no elements seen": also returned by `red` when both counts are zero.
    init = (zero(Float32), zero(Float32), Int32(0))
    # Each element starts as a one-element partial: mean x, variance term 0, count 1.
    tri(x::Float32) = (x, zero(Float32), Int32(1))
    N = sum_length(A, dims)
    # Normalisation 1/N (or 1/(N-1) when corrected), applied to every merge term.
    λ = Float32(1/(N-corrected))
    # Merge two partial triples into one.
    function red(tup1::Tuple, tup2::Tuple)
        m1, v1, k1 = tup1
        m2, v2, k2 = tup2
        k = k1 + k2
        iszero(k) && return init  # this is essential for GPU!
        invk = inv(Float32(k))
        # Count-weighted average of the two partial means.
        m = (k1 * m1 + k2 * m2) * invk
        # Sum of both variance terms plus the between-group term, already scaled by λ.
        v = v1 + v2 + λ * (m2 - m1)^2 * k1 * k2 * invk
        (Float32(m), Float32(v), Int32(k))
    end
    R = mapreduce(tri, red, A; dims, init)
    # Split the triple(s) into mean and variance; the count slot is discarded.
    mean, var, _ = if dims isa Colon
        R
    else
        # View the array of triples as Float32 values and slice along the first axis.
        # NOTE(review): presumably this adds a leading axis of length 3 — confirm.
        R3 = reinterpret(reshape, Float32, R)
        eachslice(R3, dims=1)  # produces view(::CuArray), seems OK
    end
    (; mean, var)
end

# Number of elements that are combined when reducing `x` over `dims`.
sum_length(x::AbstractArray, dims::Integer) = size(x, dims)
sum_length(x::AbstractArray, ::Colon) = length(x)
function sum_length(x::AbstractArray, dims)
    # A dimension listed more than once is only counted once.
    return mapreduce(d -> size(x, d), *, unique(dims); init=1)
end

# Here too it seems to pay to have a separate path for GPU arrays
# Generic path: normalise `x` over `dims`, multiplying by a precomputed inverse std.
function normal_newer(x::AbstractArray; dims=1)
    centre, spread = mean_var(x; dims=dims, corrected=false)
    # Avoids one sqrt evaluation per element of x.
    rstd = inv.(sqrt.(spread .+ eps(eltype(x))))
    return (x .- centre) .* rstd
end
# GPU path: a single fused broadcast dividing by the sqrt, which was quicker on GPU.
function normal_newer(x::AbstractGPUArray; dims=1)
    centre, spread = mean_var(x; dims=dims, corrected=false)
    tiny = eps(eltype(x))
    return (x .- centre) ./ sqrt.(spread .+ tiny)
end

# Two paths here start to get really ugly...
# Fused reverse rule for the generic `normal_newer` path.
# The forward pass is repeated so the pullback can close over `μ`, `istd` and `y`.
# Both reductions in the pullback run over the same `dims` as the forward pass
# (they used to be hard-coded to `dims=1`, which was wrong for any other `dims`).
function ChainRulesCore.rrule(::typeof(normal_newer), x::AbstractArray; dims=1)
    μ, σ2 = mean_var(x; dims, corrected=false)
    ε = eps(eltype(x))
    istd = inv.(sqrt.(σ2 .+ ε))
    y = (x .- μ) .* istd
    # 1/N, where N is the number of elements reduced over `dims`.
    iN = one(eltype(y)) / sum_length(x, dims)
    function back(Δ)
        dy = unthunk(Δ)
        # `tmp` first holds dy .* (x .- μ) and is then overwritten with the result;
        # `sum(tmp; dims)` is evaluated before the in-place broadcast starts.
        tmp = dy .* (x .- μ)
        dx = tmp .= istd .* (dy .- iN .* sum(dy; dims) .- y .* iN .* istd .* sum(tmp; dims))
        return (NoTangent(), dx)
    end
    return y, back
end
# Fused reverse rule for the GPU `normal_newer` path, dividing by sqrt inside the
# broadcast instead of storing an inverse-std array.
# With σ = sqrt(σ2 + ε) and y = (x - μ)/σ the gradient is
#   dx = (dy - (sum(dy) + y * sum(dy * (x - μ)) / σ) / N) / σ
# which matches the generic rule. The earlier expression left `dy` unscaled by σ,
# had the wrong sign on the `y` term and was missing one factor 1/σ.
function ChainRulesCore.rrule(::typeof(normal_newer), x::AbstractGPUArray; dims=1)
    μ, σ2 = mean_var(x; dims, corrected=false)
    ε = eps(eltype(x))
    y = @fastmath (x .- μ) ./ sqrt.(σ2 .+ ε)  # quicker on GPU
    # 1/N, where N is the number of elements reduced over `dims`.
    iN = one(eltype(y)) / sum_length(x, dims)
    function back(Δ)
        dy = unthunk(Δ)
        # `tmp` first holds dy .* (x .- μ) and is then overwritten with the result;
        # `sum(tmp; dims)` is evaluated before the in-place broadcast starts.
        tmp = dy .* (x .- μ)
        dx = @fastmath tmp .= (dy .- iN .* (sum(dy; dims) .+ y .* sum(tmp; dims) ./ sqrt.(σ2 .+ ε))) ./ sqrt.(σ2 .+ ε)
        return (NoTangent(), dx)
    end
    return y, back
end


##### Some code from https://github.com/FluxML/NNlib.jl/pull/452/files

 import ChainRulesCore: rrule, @ignore_derivatives

 # Mean and uncorrected variance of `x` over `dims`, returned as the tuple (μ, σ²).
 function norm_stats(x, dims)
     centre = mean(x; dims=dims)
     spread = var(x; dims=dims, mean=centre, corrected=false)
     return centre, spread
 end

 # Reverse rule for `norm_stats`, built from the existing rules for `mean` and `var`.
 # The incoming cotangent is a pair (dμ, dσ²); each part goes to the pullback of the
 # output it belongs to, and the two contributions to `x` are added.
 # (The cotangents used to be swapped between the two pullbacks.)
 function rrule(::typeof(norm_stats), x, dims)
     μ, mean_pullback = rrule(mean, x; dims)
     σ², var_pullback = rrule(var, x; dims, mean = μ, corrected = false)
     function norm_stats_pullback(dargs)
         dμ, dσ² = unthunk(dargs)
         # Index [2] picks the gradient for `x`; [1] is the one for the function itself.
         dx = ChainRulesCore.add!!(var_pullback(dσ²)[2], mean_pullback(dμ)[2])
         return (NoTangent(), dx, NoTangent())
     end
     return (μ, σ²), norm_stats_pullback
 end

 # Reshape an affine parameter to `dims`; an absent parameter (`nothing`) stays `nothing`.
 _maybe_reshape(::Nothing, _) = nothing
 function _maybe_reshape(x, dims)
     return reshape(x, dims)
 end

 # Apply the optional affine transform. With no scale and no bias, `x` is returned as is.
 function _apply_scale_bias(x, ::Nothing, ::Nothing)
     return x
 end
 function _apply_scale_bias(x, scale, bias)
     return x .* scale .+ bias
 end

 # Convert the scalar `y` to the floating-point version of `x`'s element type.
 function ofeltype(x, y)
     T = float(eltype(x))
     return convert(T, y)
 end

 # Normalise `x` with the given statistics, then apply the optional affine transform.
 # `scale` and `bias` must be given together or both left as `nothing`; when given,
 # they are reshaped to `affine_size` before broadcasting.
 function norm_helper(x, μ, σ², scale::Union{AbstractArray, Nothing},
                      bias::Union{AbstractArray, Nothing}, ϵ::Real, affine_size = size(μ))
     @ignore_derivatives if xor(isnothing(scale), isnothing(bias))
         error("both scale and bias must be provided or left as nothing")
     end
     γ = _maybe_reshape(scale, affine_size)
     β = _maybe_reshape(bias, affine_size)
     normalised = (x .- μ) ./ sqrt.(σ² .+ ϵ)
     return _apply_scale_bias(normalised, γ, β)
 end

 # Layer normalisation over the first `S` dimensions of `x` (default `Val(1)`), with an
 # optional affine `scale`/`bias` whose size is that of those first `S` dimensions.
 function layernorm(x::AbstractArray{<:Any, N}, ::Val{S} = Val(1), scale = nothing, bias = nothing,
                    ϵ = ofeltype(x, 1e-5)) where {N, S}
     @ignore_derivatives if S > N
         throw(DimensionMismatch("got $S reduction dims for $N-dimensional array"))
     end
     reduce_dims = ntuple(identity, S)  # (1, 2, ..., S)
     μ, σ² = norm_stats(x, reduce_dims)
     affine_size = size(x)[1:S]
     return norm_helper(x, μ, σ², scale, bias, ϵ, affine_size)
 end

##### Some code from https://github.com/chengchingwen/NeuralAttentionlib.jl/blob/b418c0d2a9e99c960e88879a5fd879d47d8e4c22/src/functional/layernorm.jl

# Map a value to the pair (x, x^2), so one reduction can give both sum and sum of squares.
function _x_x2(x)
    return (x, x * x)
end
# Map two values to the pair (x, x*y).
function _x_y2(x, y)
    return (x, x * y)
end

# Normalise one element `x`, given the inverse length `inN` and the pair
# `sum_sum2 = (sum, sum of squares)` of the slice it belongs to.
# The variance is computed as E[x^2] - μ^2 and the standard deviation is floored at `ϵ`.
function _normalize(inN::T, ϵ::T, x::T, sum_sum2::NTuple{2, T}) where T
    μ, s = sum_sum2 .* inN
    # Rounding can make E[x^2] - μ^2 slightly negative (e.g. for constant slices), which
    # would make sqrt throw a DomainError on CPU, so clamp the variance at zero first.
    σ² = max(fma(μ, -μ, s), zero(T))
    σ₀ = sqrt(σ²)  # @fastmath sqrt helps too, 2x?
    σ = max(σ₀, ϵ)  # @fastmath max helps 30% here
    return (x - μ) / σ
end

# Inverse standard deviation of one slice, given the inverse length `inN` and the pair
# `sum_sum2 = (sum, sum of squares)`. The standard deviation is floored at `ϵ`.
function _rstd(inN::T, ϵ::T, sum_sum2::NTuple{2, T}) where T
    μ, s = sum_sum2 .* inN
    # Rounding can make E[x^2] - μ^2 slightly negative (e.g. for constant slices), which
    # would make sqrt throw a DomainError on CPU, so clamp the variance at zero first.
    σ² = max(fma(μ, -μ, s), zero(T))
    σ = max(sqrt(σ²), ϵ)  # @fastmath max has no effect here
    return inv(σ)
end

# Layer normalisation over the first dimension of `x`, followed by `alpha * n + beta`.
# `nothing` for `alpha` / `beta` means scale one / shift zero. The three-argument
# method uses epsilon = 1e-5.
layer_norm(alpha, beta, x) = layer_norm(1e-5, alpha, beta, x)
function layer_norm(epsilon, alpha, beta, x)
    T = eltype(x)
    inv_len = convert(T, 1 // size(x, 1))
    ϵ = convert(T, epsilon)
    α = alpha === nothing ? one(T) : alpha
    β = beta === nothing ? zero(T) : beta
    # Array of tuples: sum_sum2 == tuple.(sum(x; dims), sum(abs2, x; dims))
    sum_sum2 = mapreduce(_x_x2, .+, x; dims=1, init=(zero(T), zero(T)))
    # One fused broadcast; note that this calls sqrt for every element of x.
    return fma.(α, _normalize.(inv_len, ϵ, x, sum_sum2), β)
end

# Per-element gradient kernel: (dy + inN * (sum_dy + n * sum_dyn)) * is, where
# `dya = (sum_dy, sum_dyn)`. Callers pass a negative `inN`.
function _fma2(dy::T, dya::NTuple{2, T}, n::T, inN::T, is::T) where T
    sum_dy, sum_dyn = dya
    correction = fma(n, sum_dyn, sum_dy)
    return fma(correction, inN, dy) * is
end
# Gradient of `layer_norm` with respect to `x`, given the output cotangent `Ȳ`, the
# scale `α`, the normalised values `n` and the per-slice `sum_sum2` from the forward pass.
# `x` is only used for its element type and first-dimension length.
function Δlayer_norm_dx(Ȳ, ϵ, α, n, x, sum_sum2)
    T = eltype(x)
    inv_len = convert(T, 1 // size(x, 1))
    # Lazy broadcasts: neither the inverse std nor Ȳ .* α is materialised here.
    rstd = Broadcast.instantiate(Broadcast.broadcasted(_rstd, inv_len, ϵ, sum_sum2))
    dy = Broadcast.instantiate(Broadcast.broadcasted(*, Ȳ, α))
    # This mapreduce(f, op, a, b) will be slow on CPU, map then reduce
    dya = mapreduce(_x_y2, .+, dy, n; dims=1, init=(zero(T), zero(T)))
    return _fma2.(dy, dya, n, -inv_len, rstd)
end

# Tuple of all dimension indices of `Ȳ` except the first: (2, 3, ..., ndims(Ȳ)).
function _taildims(Ȳ)
    alldims = ntuple(identity, Val(ndims(Ȳ)))
    return Base.tail(alldims)
end

# Reverse rule for the three-argument `layer_norm`: delegates to the four-argument
# rule with epsilon = 1e-5 and drops the gradient slot that belongs to epsilon.
function ChainRulesCore.rrule(::typeof(layer_norm), alpha, beta, x)
    y, inner = rrule(layer_norm, 1e-5, alpha, beta, x)
    function layer_norm_pullback(Ȳ)
        # Keep only the last three gradients (alpha, beta, x) of the inner pullback.
        grads = last_n(inner(Ȳ), static(3))
        return (NoTangent(), grads...)
    end
    return y, layer_norm_pullback
end

# Reverse rule for the four-argument `layer_norm`.
# Repeats the forward pass, keeping the normalised values `n` and the per-slice sums,
# and returns a pullback producing gradients for (function, epsilon, alpha, beta, x).
function ChainRulesCore.rrule(::typeof(layer_norm), epsilon, alpha, beta, x)
    T = eltype(x)
    N = size(x, 1)
    ϵ = convert(T, epsilon)
    # Static flags: is the parameter absent (c...), is it an array (a...).
    cα = static(isnothing(alpha))
    cβ = static(isnothing(beta))
    aα = static(alpha isa AbstractArray)
    aβ = static(beta isa AbstractArray)
    # Absent parameters fall back to scale one / shift zero, as in the forward method.
    α = as_bool(cα) ? one(T) : alpha
    β = as_bool(cβ) ? zero(T) : beta
    sum_sum2 = mapreduce(_x_x2, .+, x; dims=1, init = (zero(T), zero(T)))
    n = _normalize.(convert(T, 1//N), ϵ, x, sum_sum2)
    y = fma.(α, n, β)
    function layer_norm_pullback(Ybar)
        Ȳ = unthunk(Ybar)
        # Gradient for alpha: sum of Ȳ .* n, over all but the first dimension when alpha
        # is an array, over everything otherwise. No gradient when alpha was `nothing`.
        ∂α = as_bool(cα) ? NoTangent() : @thunk sum(
            Broadcast.instantiate(Broadcast.broadcasted(*, Ȳ, n));
            dims = as_bool(aα) ? _taildims(Ȳ) : :, init = zero(eltype(Ȳ))
        )
        # Gradient for beta: sum of Ȳ with the same choice of dimensions.
        ∂β = as_bool(cβ) ? NoTangent() : @thunk sum(Ȳ; dims = as_bool(aβ) ? _taildims(Ȳ) : :)
        ∂x = @thunk Δlayer_norm_dx(Ȳ, ϵ, α, n, x, sum_sum2)
        # The two leading NoTangent() entries are for the function and for epsilon.
        return (NoTangent(), NoTangent(), ∂α, ∂β, ∂x)
    end
    return y, layer_norm_pullback
end

using Static
# Turn either a plain `Bool` or a `StaticBool` into a plain `Bool`.
function as_bool(b::Bool)
    return b
end
function as_bool(b::StaticBool)
    return Bool(b)
end

# Return the last `n` entries of the tuple `s` as a new tuple.
function last_n(s::Tuple, n)
    # Index of the entry just before the first one that is kept.
    shift = static(length(s)) - n
    return ntuple(i -> s[shift + i], n)
end


##### Testing forwards

# Small test matrix: random Int8 values, promoted to floating point by adding 0.0.
x = rand(Int8, 4, 5).+0.0
# The three implementations defined at the top of this file, on the same input.
normal_now(x)
normal_new(x)
normal_newer(x)

# The NNlib-style and NeuralAttentionlib-style functions on the same input.
layernorm(x, Val(1))  # so I made Val(1) the default
layer_norm(true, false, x)
layer_norm(nothing, nothing, x)  # also encodes this


julia> @btime normal_now($(rand(100, 100)));  # most of the time in sqrt!
  min 14.541 μs, mean 19.148 μs (10 allocations, 80.45 KiB)
  min 12.875 μs, mean 16.387 μs (10 allocations, 80.45 KiB)  # with @fastmath sqrt

julia> @btime normal_new($(rand(100, 100)));  # much quicker, same memory
  min 5.048 μs, mean 8.829 μs (11 allocations, 81.33 KiB)
  min 5.056 μs, mean 9.781 μs (11 allocations, 81.33 KiB)  # with @fastmath sqrt

julia> @btime normal_newer($(rand(100, 100)));
  min 5.076 μs, mean 11.021 μs (11 allocations, 81.33 KiB)

julia> @btime layernorm($(rand(100, 100)));
  min 14.500 μs, mean 18.573 μs (18 allocations, 80.73 KiB)

julia> @btime layer_norm(true, false, $(rand(100, 100)));
  min 16.834 μs, mean 20.344 μs (3 allocations, 79.94 KiB)  # as above
  min 6.917 μs, mean 10.702 μs (3 allocations, 79.94 KiB)   # with @fastmath max & sqrt


#=
# Components, to see where the time is...

@btime copy($(rand(100, 100)));  # 78.17 KiB

@btime sum($(rand(100, 100)); dims=1);
m1 = @btime mean($(rand(100, 100)); dims=1);

@btime var($(rand(100, 100)); dims=1, mean=$m1, corrected=false);  # does not allocate a copy
@btime std($(rand(100, 100)); dims=1, mean=$m1, corrected=false);

@btime sqrt.($m1);
@btime inv.(sqrt.($m1));
@btime inv.(sqrt.($(rand(100, 100))));

=#

##### Testing gradient
# Can save a lot of memory, but not much speedup, at least on M1 mac.

using Zygote, BenchmarkTools

# Gradient of the sum of the first output row with respect to the input, evaluated at
# the global test matrix `x`, for each implementation in turn.
Zygote.gradient(x -> sum(normal_now(x)[1,:]), x)[1]
Zygote.gradient(x -> sum(normal_new(x)[1,:]), x)[1]
Zygote.gradient(x -> sum(normal_newer(x)[1,:]), x)[1]

Zygote.gradient(x -> sum(layernorm(x)[1,:]), x)[1]  # this does not look the same
Zygote.gradient(x -> sum(layer_norm(true, false, x)[1,:]), x)[1]


julia> @btime Zygote.gradient(x -> sum(abs2, x), $(rand(100, 100)));  # baseline, no norm!
  min 3.391 μs, mean 11.354 μs (2 allocations, 78.17 KiB)

julia> @btime Zygote.gradient(x -> sum(abs2, normal_now(x)), $(rand(100, 100)));
  min 26.334 μs, mean 85.340 μs (57 allocations, 637.97 KiB)

julia> @btime Zygote.gradient(x -> sum(abs2, normal_new(x)), $(rand(100, 100)));
  min 26.750 μs, mean 48.510 μs (33 allocations, 240.11 KiB)

julia> @btime Zygote.gradient(x -> sum(abs2, normal_newer(x)), $(rand(100, 100)));
  min 26.959 μs, mean 53.320 μs (33 allocations, 240.11 KiB)

julia> (637.97-78.17) / (240.11-78.17)
3.4568358651352358

julia> @btime Zygote.gradient(x -> sum(abs2, layernorm(x)), $(rand(100, 100)));
  min 81.666 μs, mean 130.307 μs (252 allocations, 570.58 KiB)

julia> @btime Zygote.gradient(x -> sum(abs2, layer_norm(true, false, x)), $(rand(100, 100)));
  min 77.500 μs, mean 125.485 μs (37 allocations, 473.62 KiB)


##### GPU times
# The conclusion here is that NVIDIA did this optimisation better than I did.
# No point optimising GPU case; can LayerNorm call BatchNorm's backend too?

using CUDA, Flux

julia> cx = cu(randn(100, 1000));

julia> CUDA.@time cx .+ 1;  # baseline
  0.000124 seconds (40 CPU allocations: 1.750 KiB) (1 GPU allocation: 390.625 KiB, 16.67% memmgmt time)

julia> CUDA.@time Flux.normalise(cx);
  0.000431 seconds (441 CPU allocations: 21.453 KiB) (9 GPU allocations: 784.766 KiB, 11.47% memmgmt time)

julia> CUDA.@time normal_now(cx);
  0.000292 seconds (258 CPU allocations: 11.516 KiB) (6 GPU allocations: 796.875 KiB, 13.49% memmgmt time)

julia> CUDA.@time normal_new(cx);
  0.000307 seconds (296 CPU allocations: 13.203 KiB) (7 GPU allocations: 800.781 KiB, 13.97% memmgmt time)


# gradients

julia> CUDA.@time Zygote.gradient(x -> sum(sin, x), cx);  # baseline
  0.060339 seconds (16.40 k CPU allocations: 963.312 KiB) (6 GPU allocations: 1.908 MiB, 0.13% memmgmt time)

julia> CUDA.@time Zygote.gradient(x -> sum(sin, Flux.normalise(x)), cx);
  0.156524 seconds (38.36 k CPU allocations: 2.249 MiB) (29 GPU allocations: 5.348 MiB, 0.60% memmgmt time)

julia> CUDA.@time Zygote.gradient(x -> sum(sin, normal_now(x)), cx);
  0.079000 seconds (23.59 k CPU allocations: 1.313 MiB) (28 GPU allocations: 5.387 MiB, 0.21% memmgmt time)

julia> CUDA.@time Zygote.gradient(x -> sum(sin, normal_new(x)), cx);
  0.074505 seconds (24.45 k CPU allocations: 1.396 MiB) (16 GPU allocations: 3.079 MiB, 0.15% memmgmt time)


# batchnorm
# bn2 = BatchNorm(100, affine=false) |> gpu  # gives an error!
# But what does work, is more efficient than mine: 2.672 MiB < 3.079 MiB

julia> bn1 = BatchNorm(100) |> gpu
BatchNorm(100)      # 200 parameters, plus 200 non-trainable

julia> CUDA.@time bn1(cx);
  0.000291 seconds (42 CPU allocations: 1.312 KiB) (1 GPU allocation: 390.625 KiB, 7.79% memmgmt time)

julia> CUDA.@time Zygote.gradient(x -> sum(sin, bn1(x)), cx);
  0.099942 seconds (36.84 k CPU allocations: 2.064 MiB) (10 GPU allocations: 2.672 MiB, 0.13% memmgmt time)


# layernorm
# here it is literally calling Flux.normalise
# When the dims work, can it call BatchNorm instead?

julia> ln1 = LayerNorm(100) |> gpu
LayerNorm(100)      # 200 parameters

julia> ln2 = LayerNorm(100, affine=false) |> gpu
LayerNorm(100)      # 200 parameters

julia> CUDA.@time ln2(cx);
  0.000369 seconds (320 CPU allocations: 15.688 KiB) (7 GPU allocations: 800.781 KiB, 16.56% memmgmt time)

julia> CUDA.@time Zygote.gradient(x -> sum(sin, ln2(x)), cx);
  0.136035 seconds (37.73 k CPU allocations: 2.104 MiB, 23.66% gc time) (26 GPU allocations: 5.376 MiB, 0.19% memmgmt time)


##### GPU, January
# Now updated to include https://github.com/JuliaGPU/GPUArrays.jl/pull/443

julia> let x = CUDA.randn(100, 1000)  # Forward pass, @btime
           @btime CUDA.@sync copy($x)  # baseline
           println()
           @btime CUDA.@sync normal_now($x)
           @btime CUDA.@sync normal_new($x)
           @btime CUDA.@sync normal_newer($x)
           println()
           @btime CUDA.@sync layernorm($x)
           @btime CUDA.@sync layer_norm(nothing, nothing, $x)
           println()
           μ = @btime CUDA.@sync mean($x; dims=1)
           @btime CUDA.@sync var($x; mean=$μ, corrected=false, dims=1)
       end;
  17.749 μs (13 allocations: 400 bytes)

  96.518 μs (219 allocations: 10.88 KiB)
  106.690 μs (257 allocations: 12.56 KiB)
  56.772 μs (144 allocations: 9.45 KiB)

  93.800 μs (225 allocations: 11.41 KiB)
  43.795 μs (93 allocations: 4.33 KiB)

  43.810 μs (99 allocations: 4.59 KiB)
  41.918 μs (92 allocations: 5.17 KiB)

julia> let x = CUDA.randn(1000, 10_000)  # Forward pass, alloc
           CUDA.@time copy(x)  # baseline
           println()
           CUDA.@time normal_now(x)
           CUDA.@time normal_new(x)  # surprisingly large alloc, why? fixed.
           CUDA.@time normal_newer(x)
           println()
           CUDA.@time layernorm(x)
           CUDA.@time layer_norm(nothing, nothing, x)
           println()
           μ = CUDA.@time mean(x; dims=1)
           CUDA.@time var(x; mean=μ, corrected=false, dims=1)  # allocates a big array... before 443
       end;
  0.000192 seconds (13 CPU allocations: 400 bytes) (1 GPU allocation: 38.147 MiB, 13.82% memmgmt time)

  0.001525 seconds (279 CPU allocations: 14.281 KiB) (4 GPU allocations: 38.261 MiB, 1.92% memmgmt time)
  0.000904 seconds (315 CPU allocations: 15.859 KiB) (5 GPU allocations: 38.300 MiB, 3.44% memmgmt time)
  0.006859 seconds (216 CPU allocations: 13.625 KiB) (2 GPU allocations: 38.261 MiB, 0.36% memmgmt time)

  0.009269 seconds (365 CPU allocations: 19.094 KiB) (4 GPU allocations: 38.261 MiB, 0.38% memmgmt time)
  0.005154 seconds (110 CPU allocations: 5.188 KiB) (2 GPU allocations: 38.223 MiB, 0.44% memmgmt time)

  0.000320 seconds (101 CPU allocations: 4.625 KiB) (2 GPU allocations: 78.125 KiB, 3.75% memmgmt time)
  0.015003 seconds (156 CPU allocations: 8.453 KiB) (1 GPU allocation: 39.062 KiB, 0.09% memmgmt time)

julia> let x = CUDA.randn(100, 1000) # Gradient, @btime
           @btime CUDA.@sync Zygote.gradient(x -> sum(abs2, x), $x)  # baseline, no norm!
           println()
           @btime CUDA.@sync Zygote.gradient(x -> sum(abs2, normal_now(x)), $x)
           @btime CUDA.@sync Zygote.gradient(x -> sum(abs2, normal_new(x)), $x)
           @btime CUDA.@sync Zygote.gradient(x -> sum(abs2, normal_newer(x)), $x)
           println()
           @btime CUDA.@sync Zygote.gradient(x -> sum(abs2, layernorm(x, Val(1))), $x)
           @btime CUDA.@sync Zygote.gradient(x -> sum(abs2, layer_norm(nothing, nothing, x)), $x)
        end
  176.829 μs (325 allocations: 14.14 KiB)

  685.050 μs (1250 allocations: 60.31 KiB)
  413.809 μs (789 allocations: 42.55 KiB)
  373.748 μs (666 allocations: 39.12 KiB)

  993.590 μs (1494 allocations: 74.17 KiB)
  365.185 μs (565 allocations: 25.61 KiB)

julia> let x = CUDA.randn(1000, 10_000)  # Gradient, alloc
           CUDA.@time Zygote.gradient(x -> sum(abs2, x), x)  # baseline, no norm!
           println()
           CUDA.@time Zygote.gradient(x -> sum(abs2, normal_now(x)), x)
           CUDA.@time Zygote.gradient(x -> sum(abs2, normal_new(x)), x)
           CUDA.@time Zygote.gradient(x -> sum(abs2, normal_newer(x)), x)
           println()
           CUDA.@time Zygote.gradient(x -> sum(abs2, layernorm(x, Val(1))), x)
           CUDA.@time Zygote.gradient(x -> sum(abs2, layer_norm(nothing, nothing, x)), x)
       end;
  0.055923 seconds (16.29 k CPU allocations: 959.204 KiB) (6 GPU allocations: 190.735 MiB, 0.09% memmgmt time)

  0.079773 seconds (23.45 k CPU allocations: 1.308 MiB) (26 GPU allocations: 458.222 MiB, 0.18% memmgmt time)
  0.076155 seconds (24.29 k CPU allocations: 1.388 MiB) (14 GPU allocations: 267.258 MiB, 0.12% memmgmt time)
  0.079763 seconds (24.18 k CPU allocations: 1.392 MiB) (11 GPU allocations: 267.220 MiB, 0.11% memmgmt time)

  0.106857 seconds (30.34 k CPU allocations: 1.698 MiB) (27 GPU allocations: 496.369 MiB, 0.15% memmgmt time)
  0.076044 seconds (24.26 k CPU allocations: 1.388 MiB) (11 GPU allocations: 305.329 MiB, 0.11% memmgmt time)

	##### Some code I had for normalisation

	using ChainRulesCore, Statistics

	# Baseline normalisation along the first dimension, like what LayerNorm does in Flux:
	# subtract the mean, then divide every element by sqrt(variance + eps).
	function normal_now(x::AbstractArray)
	    centre = mean(x; dims=1)
	    spread = var(x; dims=1, mean=centre, corrected=false)
	    tiny = eps(eltype(x))
	    return (x .- centre) ./ sqrt.(spread .+ tiny)
	end

	# Improved version: the inverse standard deviation is computed once per reduced
	# slice, so sqrt is not evaluated for every element of x.
	function normal_new(x::AbstractArray)
	    centre = mean(x; dims=1)
	    spread = var(x; dims=1, mean=centre, corrected=false)
	    rstd = inv.(sqrt.(spread .+ eps(eltype(x))))
	    # Alternative that was quicker on GPU:
	    #   y = @fastmath (x .- μ) ./ sqrt.(σ2 .+ ε)
	    return (x .- centre) .* rstd
	end

	# Mostly I thought a bit about how to write a fused gradient here:

	# Fused reverse rule for `normal_new`. The forward pass is repeated here so that the
	# pullback can close over `μ`, `istd` and `y`.
	# (The `.*` operators in this block had been lost, leaving code that did not parse.)
	function ChainRulesCore.rrule(::typeof(normal_new), x::AbstractArray)
	    μ = mean(x; dims=1)
	    σ2 = var(x; dims=1, mean=μ, corrected=false)
	    ε = eps(eltype(x))
	    istd = inv.(sqrt.(σ2 .+ ε))
	    y = (x .- μ) .* istd
	    # y = @fastmath (x .- μ) ./ sqrt.(σ2.+ε) # quicker on GPU
	    iN = one(eltype(y)) / size(x, 1)
	    function back(Δ)
	        dy = unthunk(Δ)
	        # dx = istd.*(dy .- mean(dy; dims=1) .- y.*istd.*mean(dy.*(x.-μ); dims=1))

	        # Like this, uses 38% of the memory:
	        # dx = istd.*(dy .- iN .* sum(dy; dims=1) .- y .* iN .* istd .* sum(dy.*(x.-μ); dims=1))

	        # Like this, uses 25%, I said. Should be safe in a jacobian.
	        # `tmp` is overwritten in place; `sum(tmp; dims=1)` is evaluated first.
	        tmp = dy .* (x .- μ)
	        dx = tmp .= istd .* (dy .- iN .* sum(dy; dims=1) .- y .* iN .* istd .* sum(tmp; dims=1))
	        return (NoTangent(), dx)
	    end
	    return y, back
	end

	##### Newer version...

	using GPUArraysCore, Statistics, ChainRulesCore

	# Generic path: mean and variance of `A` over `dims` via Statistics, returned as the
	# named tuple `(; mean, var)`.
	function mean_var(A::AbstractArray; dims, corrected::Bool=false)
	    centre = mean(A; dims=dims)
	    spread = var(A; dims=dims, corrected=corrected, mean=centre)
	    return (mean=centre, var=spread)
	end

	# This is something like Welford's algorithm. It's pretty slow on CPU.
	# Single-pass mean and variance for Float32 GPU arrays, done as one `mapreduce` over
	# (mean, scaled variance, count) triples. Returns the named tuple `(; mean, var)`.
	function mean_var(A::AbstractGPUArray{Float32}; dims, corrected::Bool=false)
	# Triple for "no elements seen": also returned by `red` when both counts are zero.
	init = (zero(Float32), zero(Float32), Int32(0))
	# Each element starts as a one-element partial: mean x, variance term 0, count 1.
	tri(x::Float32) = (x, zero(Float32), Int32(1))
	N = sum_length(A, dims)
	# Normalisation 1/N (or 1/(N-1) when corrected), applied to every merge term.
	λ = Float32(1/(N-corrected))
	# Merge two partial triples into one.
	function red(tup1::Tuple, tup2::Tuple)
	m1, v1, k1 = tup1
	m2, v2, k2 = tup2
	k = k1 + k2
	iszero(k) && return init # this is essential for GPU!
	invk = inv(Float32(k))
	# Count-weighted average of the two partial means.
	m = (k1 * m1 + k2 * m2) * invk
	# Sum of both variance terms plus the between-group term, already scaled by λ.
	v = v1 + v2 + λ * (m2 - m1)^2 * k1 * k2 * invk
	(Float32(m), Float32(v), Int32(k))
	end
	R = mapreduce(tri, red, A; dims, init)
	# Split the triple(s) into mean and variance; the count slot is discarded.
	mean, var, _ = if dims isa Colon
	R
	else
	# View the array of triples as Float32 values and slice along the first axis.
	R3 = reinterpret(reshape, Float32, R)
	eachslice(R3, dims=1) # produces view(::CuArray), seems OK
	end
	(; mean, var)
	end

	# Number of elements that are combined when reducing `x` over `dims`.
	sum_length(x::AbstractArray, dims::Integer) = size(x, dims)
	sum_length(x::AbstractArray, ::Colon) = length(x)
	function sum_length(x::AbstractArray, dims)
	    # A dimension listed more than once is only counted once.
	    return mapreduce(d -> size(x, d), *, unique(dims); init=1)
	end

	# Here too it seems to pay to have a separate path for GPU arrays
	# Generic path: normalise `x` over `dims`, multiplying by a precomputed inverse std.
	function normal_newer(x::AbstractArray; dims=1)
	    centre, spread = mean_var(x; dims=dims, corrected=false)
	    # Avoids one sqrt evaluation per element of x.
	    rstd = inv.(sqrt.(spread .+ eps(eltype(x))))
	    return (x .- centre) .* rstd
	end
	# GPU path: a single fused broadcast dividing by the sqrt, which was quicker on GPU.
	function normal_newer(x::AbstractGPUArray; dims=1)
	    centre, spread = mean_var(x; dims=dims, corrected=false)
	    tiny = eps(eltype(x))
	    return (x .- centre) ./ sqrt.(spread .+ tiny)
	end

	# Two paths here start to get really ugly...
	# Fused reverse rule for the generic `normal_newer` path.
	# Restores the `.*` operators that had been lost from the pullback expression, and
	# reduces over the `dims` keyword instead of a hard-coded `dims=1`, so the gradient
	# matches the forward pass for every choice of `dims`.
	function ChainRulesCore.rrule(::typeof(normal_newer), x::AbstractArray; dims=1)
	    μ, σ2 = mean_var(x; dims, corrected=false)
	    ε = eps(eltype(x))
	    istd = inv.(sqrt.(σ2 .+ ε))
	    y = (x .- μ) .* istd
	    # 1/N, where N is the number of elements reduced over `dims`.
	    iN = one(eltype(y)) / sum_length(x, dims)
	    function back(Δ)
	        dy = unthunk(Δ)
	        # `tmp` is overwritten in place; `sum(tmp; dims)` is evaluated first.
	        tmp = dy .* (x .- μ)
	        dx = tmp .= istd .* (dy .- iN .* sum(dy; dims) .- y .* iN .* istd .* sum(tmp; dims))
	        return (NoTangent(), dx)
	    end
	    return y, back
	end
	# Fused reverse rule for the GPU `normal_newer` path.
	# With σ = sqrt(σ2 + ε) and y = (x - μ)/σ the gradient is
	#   dx = (dy - (sum(dy) + y * sum(dy * (x - μ)) / σ) / N) / σ
	# The earlier expression left `dy` unscaled by σ, had the wrong sign on the `y`
	# term and was missing one factor 1/σ.
	function ChainRulesCore.rrule(::typeof(normal_newer), x::AbstractGPUArray; dims=1)
	    μ, σ2 = mean_var(x; dims, corrected=false)
	    ε = eps(eltype(x))
	    y = @fastmath (x .- μ) ./ sqrt.(σ2 .+ ε) # quicker on GPU
	    # 1/N, where N is the number of elements reduced over `dims`.
	    iN = one(eltype(y)) / sum_length(x, dims)
	    function back(Δ)
	        dy = unthunk(Δ)
	        # `tmp` is overwritten in place; `sum(tmp; dims)` is evaluated first.
	        tmp = dy .* (x .- μ)
	        dx = @fastmath tmp .= (dy .- iN .* (sum(dy; dims) .+ y .* sum(tmp; dims) ./ sqrt.(σ2 .+ ε))) ./ sqrt.(σ2 .+ ε)
	        return (NoTangent(), dx)
	    end
	    return y, back
	end



	##### Some code from https://github.com/FluxML/NNlib.jl/pull/452/files

	import ChainRulesCore: rrule, @ignore_derivatives

	# Mean and uncorrected variance of `x` over `dims`, returned as the tuple (μ, σ²).
	function norm_stats(x, dims)
	    centre = mean(x; dims=dims)
	    spread = var(x; dims=dims, mean=centre, corrected=false)
	    return centre, spread
	end

	# Reverse rule for `norm_stats`, built from the existing rules for `mean` and `var`.
	# The incoming cotangent is a pair (dμ, dσ²); each part goes to the pullback of the
	# output it belongs to (they used to be swapped), and the two results are added.
	function rrule(::typeof(norm_stats), x, dims)
	    μ, mean_pullback = rrule(mean, x; dims)
	    σ², var_pullback = rrule(var, x; dims, mean = μ, corrected = false)
	    function norm_stats_pullback(dargs)
	        dμ, dσ² = unthunk(dargs)
	        # Index [2] picks the gradient for `x`; [1] is the one for the function itself.
	        dx = ChainRulesCore.add!!(var_pullback(dσ²)[2], mean_pullback(dμ)[2])
	        return (NoTangent(), dx, NoTangent())
	    end
	    return (μ, σ²), norm_stats_pullback
	end

	# Reshape an affine parameter to `dims`; an absent parameter (`nothing`) stays `nothing`.
	_maybe_reshape(::Nothing, _) = nothing
	function _maybe_reshape(x, dims)
	    return reshape(x, dims)
	end

	# Apply the optional affine transform. With no scale and no bias, `x` is returned as is.
	function _apply_scale_bias(x, ::Nothing, ::Nothing)
	    return x
	end
	function _apply_scale_bias(x, scale, bias)
	    return x .* scale .+ bias
	end

	# Convert the scalar `y` to the floating-point version of `x`'s element type.
	function ofeltype(x, y)
	    T = float(eltype(x))
	    return convert(T, y)
	end

	# Normalise `x` with the given statistics, then apply the optional affine transform.
	# `scale` and `bias` must be given together or both left as `nothing`.
	function norm_helper(x, μ, σ², scale::Union{AbstractArray, Nothing},
	                     bias::Union{AbstractArray, Nothing}, ϵ::Real, affine_size = size(μ))
	    @ignore_derivatives if xor(isnothing(scale), isnothing(bias))
	        error("both scale and bias must be provided or left as nothing")
	    end
	    γ = _maybe_reshape(scale, affine_size)
	    β = _maybe_reshape(bias, affine_size)
	    normalised = (x .- μ) ./ sqrt.(σ² .+ ϵ)
	    return _apply_scale_bias(normalised, γ, β)
	end

	# Layer normalisation over the first `S` dimensions of `x` (default `Val(1)`), with an
	# optional affine `scale`/`bias` whose size is that of those first `S` dimensions.
	function layernorm(x::AbstractArray{<:Any, N}, ::Val{S} = Val(1), scale = nothing, bias = nothing,
	                   ϵ = ofeltype(x, 1e-5)) where {N, S}
	    @ignore_derivatives if S > N
	        throw(DimensionMismatch("got $S reduction dims for $N-dimensional array"))
	    end
	    reduce_dims = ntuple(identity, S)  # (1, 2, ..., S)
	    μ, σ² = norm_stats(x, reduce_dims)
	    affine_size = size(x)[1:S]
	    return norm_helper(x, μ, σ², scale, bias, ϵ, affine_size)
	end

	##### Some code from https://github.com/chengchingwen/NeuralAttentionlib.jl/blob/b418c0d2a9e99c960e88879a5fd879d47d8e4c22/src/functional/layernorm.jl

	# Map a value to the pair (x, x^2), so one reduction gives both sum and sum of squares.
	function _x_x2(x)
	    return (x, x * x)
	end
	# Map two values to the pair (x, x*y).
	function _x_y2(x, y)
	    return (x, x * y)
	end

	# Normalise one element `x`, given the inverse length `inN` and the pair
	# `sum_sum2 = (sum, sum of squares)` of the slice it belongs to.
	# The variance is computed as E[x^2] - μ^2 and the standard deviation is floored at `ϵ`.
	function _normalize(inN::T, ϵ::T, x::T, sum_sum2::NTuple{2, T}) where T
	    μ, s = sum_sum2 .* inN
	    # Rounding can make E[x^2] - μ^2 slightly negative (e.g. for constant slices),
	    # which would make sqrt throw a DomainError on CPU, so clamp at zero first.
	    σ² = max(fma(μ, -μ, s), zero(T))
	    σ₀ = sqrt(σ²) # @fastmath sqrt helps too, 2x?
	    σ = max(σ₀, ϵ) # @fastmath max helps 30% here
	    return (x - μ) / σ
	end

	# Inverse standard deviation of one slice, given the inverse length `inN` and the pair
	# `sum_sum2 = (sum, sum of squares)`. The standard deviation is floored at `ϵ`.
	function _rstd(inN::T, ϵ::T, sum_sum2::NTuple{2, T}) where T
	    μ, s = sum_sum2 .* inN
	    # Rounding can make E[x^2] - μ^2 slightly negative (e.g. for constant slices),
	    # which would make sqrt throw a DomainError on CPU, so clamp at zero first.
	    σ² = max(fma(μ, -μ, s), zero(T))
	    σ = max(sqrt(σ²), ϵ) # @fastmath max has no effect here
	    return inv(σ)
	end

	# Layer normalisation over the first dimension of `x`, followed by `alpha * n + beta`.
	# `nothing` for `alpha` / `beta` means scale one / shift zero. The three-argument
	# method uses epsilon = 1e-5.
	layer_norm(alpha, beta, x) = layer_norm(1e-5, alpha, beta, x)
	function layer_norm(epsilon, alpha, beta, x)
	    T = eltype(x)
	    inv_len = convert(T, 1 // size(x, 1))
	    ϵ = convert(T, epsilon)
	    α = alpha === nothing ? one(T) : alpha
	    β = beta === nothing ? zero(T) : beta
	    # Array of tuples: sum_sum2 == tuple.(sum(x; dims), sum(abs2, x; dims))
	    sum_sum2 = mapreduce(_x_x2, .+, x; dims=1, init=(zero(T), zero(T)))
	    # One fused broadcast; note that this calls sqrt for every element of x.
	    return fma.(α, _normalize.(inv_len, ϵ, x, sum_sum2), β)
	end

	# Per-element gradient kernel: (dy + inN * (sum_dy + n * sum_dyn)) * is, where
	# `dya = (sum_dy, sum_dyn)`. Callers pass a negative `inN`.
	function _fma2(dy::T, dya::NTuple{2, T}, n::T, inN::T, is::T) where T
	    sum_dy, sum_dyn = dya
	    correction = fma(n, sum_dyn, sum_dy)
	    return fma(correction, inN, dy) * is
	end
	# Gradient of `layer_norm` with respect to `x`, given the output cotangent `Ȳ`, the
	# scale `α`, the normalised values `n` and the per-slice `sum_sum2`.
	# `x` is only used for its element type and first-dimension length.
	function Δlayer_norm_dx(Ȳ, ϵ, α, n, x, sum_sum2)
	    T = eltype(x)
	    inv_len = convert(T, 1 // size(x, 1))
	    # Lazy broadcasts: neither the inverse std nor Ȳ .* α is materialised here.
	    rstd = Broadcast.instantiate(Broadcast.broadcasted(_rstd, inv_len, ϵ, sum_sum2))
	    dy = Broadcast.instantiate(Broadcast.broadcasted(*, Ȳ, α))
	    # This mapreduce(f, op, a, b) will be slow on CPU, map then reduce
	    dya = mapreduce(_x_y2, .+, dy, n; dims=1, init=(zero(T), zero(T)))
	    return _fma2.(dy, dya, n, -inv_len, rstd)
	end

	# Tuple of all dimension indices of `Ȳ` except the first: (2, 3, ..., ndims(Ȳ)).
	function _taildims(Ȳ)
	    alldims = ntuple(identity, Val(ndims(Ȳ)))
	    return Base.tail(alldims)
	end

	# Reverse rule for the three-argument `layer_norm`: delegates to the four-argument
	# rule with epsilon = 1e-5 and drops the gradient slot that belongs to epsilon.
	function ChainRulesCore.rrule(::typeof(layer_norm), alpha, beta, x)
	    y, inner = rrule(layer_norm, 1e-5, alpha, beta, x)
	    function layer_norm_pullback(Ȳ)
	        # Keep only the last three gradients (alpha, beta, x) of the inner pullback.
	        grads = last_n(inner(Ȳ), static(3))
	        return (NoTangent(), grads...)
	    end
	    return y, layer_norm_pullback
	end

	# Reverse rule for the four-argument `layer_norm`.
	# Repeats the forward pass, keeping the normalised values `n` and the per-slice sums,
	# and returns a pullback producing gradients for (function, epsilon, alpha, beta, x).
	function ChainRulesCore.rrule(::typeof(layer_norm), epsilon, alpha, beta, x)
	T = eltype(x)
	N = size(x, 1)
	ϵ = convert(T, epsilon)
	# Static flags: is the parameter absent (c...), is it an array (a...).
	cα = static(isnothing(alpha))
	cβ = static(isnothing(beta))
	aα = static(alpha isa AbstractArray)
	aβ = static(beta isa AbstractArray)
	# Absent parameters fall back to scale one / shift zero, as in the forward method.
	α = as_bool(cα) ? one(T) : alpha
	β = as_bool(cβ) ? zero(T) : beta
	sum_sum2 = mapreduce(_x_x2, .+, x; dims=1, init = (zero(T), zero(T)))
	n = _normalize.(convert(T, 1//N), ϵ, x, sum_sum2)
	y = fma.(α, n, β)
	function layer_norm_pullback(Ybar)
	Ȳ = unthunk(Ybar)
	# Gradient for alpha: sum of Ȳ .* n, over all but the first dimension when alpha
	# is an array, over everything otherwise. No gradient when alpha was `nothing`.
	∂α = as_bool(cα) ? NoTangent() : @thunk sum(
	Broadcast.instantiate(Broadcast.broadcasted(*, Ȳ, n));
	dims = as_bool(aα) ? _taildims(Ȳ) : :, init = zero(eltype(Ȳ))
	)
	# Gradient for beta: sum of Ȳ with the same choice of dimensions.
	∂β = as_bool(cβ) ? NoTangent() : @thunk sum(Ȳ; dims = as_bool(aβ) ? _taildims(Ȳ) : :)
	∂x = @thunk Δlayer_norm_dx(Ȳ, ϵ, α, n, x, sum_sum2)
	# The two leading NoTangent() entries are for the function and for epsilon.
	return (NoTangent(), NoTangent(), ∂α, ∂β, ∂x)
	end
	return y, layer_norm_pullback
	end

	using Static
	# Turn either a plain `Bool` or a `StaticBool` into a plain `Bool`.
	function as_bool(b::Bool)
	    return b
	end
	function as_bool(b::StaticBool)
	    return Bool(b)
	end

	# Return the last `n` entries of the tuple `s` as a new tuple.
	function last_n(s::Tuple, n)
	    # Index of the entry just before the first one that is kept.
	    shift = static(length(s)) - n
	    return ntuple(i -> s[shift + i], n)
	end


	##### Testing forwards

	# Small test matrix: random Int8 values, promoted to floating point by adding 0.0.
	x = rand(Int8, 4, 5).+0.0
	# The three implementations defined earlier in this file, on the same input.
	normal_now(x)
	normal_new(x)
	normal_newer(x)

	# The NNlib-style and NeuralAttentionlib-style functions on the same input.
	layernorm(x, Val(1)) # so I made Val(1) the default
	layer_norm(true, false, x)
	layer_norm(nothing, nothing, x) # also encodes this


	julia> @btime normal_now($(rand(100, 100))); # most of the time in sqrt!
	min 14.541 μs, mean 19.148 μs (10 allocations, 80.45 KiB)
	min 12.875 μs, mean 16.387 μs (10 allocations, 80.45 KiB) # with @fastmath sqrt

	julia> @btime normal_new($(rand(100, 100))); # much quicker, same memory
	min 5.048 μs, mean 8.829 μs (11 allocations, 81.33 KiB)
	min 5.056 μs, mean 9.781 μs (11 allocations, 81.33 KiB) # with @fastmath sqrt

	julia> @btime normal_newer($(rand(100, 100)));
	min 5.076 μs, mean 11.021 μs (11 allocations, 81.33 KiB)

	julia> @btime layernorm($(rand(100, 100)));
	min 14.500 μs, mean 18.573 μs (18 allocations, 80.73 KiB)

	julia> @btime layer_norm(true, false, $(rand(100, 100)));
	min 16.834 μs, mean 20.344 μs (3 allocations, 79.94 KiB) # as above
	min 6.917 μs, mean 10.702 μs (3 allocations, 79.94 KiB) # with @fastmath max & sqrt


	#=
	# Components, to see where the time is...

	@btime copy($(rand(100, 100))); # 78.17 KiB

	@btime sum($(rand(100, 100)); dims=1);
	m1 = @btime mean($(rand(100, 100)); dims=1);

	@btime var($(rand(100, 100)); dims=1, mean=$m1, corrected=false); # does not allocate a copy
	@btime std($(rand(100, 100)); dims=1, mean=$m1, corrected=false);

	@btime sqrt.($m1);
	@btime inv.(sqrt.($m1));
	@btime inv.(sqrt.($(rand(100, 100))));

	=#

	##### Testing gradient
	# Can save a lot of memory, but not much speedup, at least on M1 mac.

	using Zygote, BenchmarkTools

	# Gradient of the sum of the first output row with respect to the input, evaluated
	# at the global test matrix `x`, for each implementation in turn.
	Zygote.gradient(x -> sum(normal_now(x)[1,:]), x)[1]
	Zygote.gradient(x -> sum(normal_new(x)[1,:]), x)[1]
	Zygote.gradient(x -> sum(normal_newer(x)[1,:]), x)[1]

	Zygote.gradient(x -> sum(layernorm(x)[1,:]), x)[1] # this does not look the same
	Zygote.gradient(x -> sum(layer_norm(true, false, x)[1,:]), x)[1]


	julia> @btime Zygote.gradient(x -> sum(abs2, x), $(rand(100, 100))); # baseline, no norm!
	min 3.391 μs, mean 11.354 μs (2 allocations, 78.17 KiB)

	julia> @btime Zygote.gradient(x -> sum(abs2, normal_now(x)), $(rand(100, 100)));
	min 26.334 μs, mean 85.340 μs (57 allocations, 637.97 KiB)

	julia> @btime Zygote.gradient(x -> sum(abs2, normal_new(x)), $(rand(100, 100)));
	min 26.750 μs, mean 48.510 μs (33 allocations, 240.11 KiB)

	julia> @btime Zygote.gradient(x -> sum(abs2, normal_newer(x)), $(rand(100, 100)));
	min 26.959 μs, mean 53.320 μs (33 allocations, 240.11 KiB)

	julia> (637.97-78.17) / (240.11-78.17)
	3.4568358651352358

	julia> @btime Zygote.gradient(x -> sum(abs2, layernorm(x)), $(rand(100, 100)));
	min 81.666 μs, mean 130.307 μs (252 allocations, 570.58 KiB)

	julia> @btime Zygote.gradient(x -> sum(abs2, layer_norm(true, false, x)), $(rand(100, 100)));
	min 77.500 μs, mean 125.485 μs (37 allocations, 473.62 KiB)


	##### GPU times
	# The conclusion here is that NVIDIA did this optimisation better than I did.
	# No point optimising GPU case; can LayerNorm call BatchNorm's backend too?

	using CUDA, Flux

	julia> cx = cu(randn(100, 1000));

	julia> CUDA.@time cx .+ 1; # baseline
	0.000124 seconds (40 CPU allocations: 1.750 KiB) (1 GPU allocation: 390.625 KiB, 16.67% memmgmt time)

	julia> CUDA.@time Flux.normalise(cx);
	0.000431 seconds (441 CPU allocations: 21.453 KiB) (9 GPU allocations: 784.766 KiB, 11.47% memmgmt time)

	julia> CUDA.@time normal_now(cx);
	0.000292 seconds (258 CPU allocations: 11.516 KiB) (6 GPU allocations: 796.875 KiB, 13.49% memmgmt time)

	julia> CUDA.@time normal_new(cx);
	0.000307 seconds (296 CPU allocations: 13.203 KiB) (7 GPU allocations: 800.781 KiB, 13.97% memmgmt time)


	# gradients

	julia> CUDA.@time Zygote.gradient(x -> sum(sin, x), cx); # baseline
	0.060339 seconds (16.40 k CPU allocations: 963.312 KiB) (6 GPU allocations: 1.908 MiB, 0.13% memmgmt time)

	julia> CUDA.@time Zygote.gradient(x -> sum(sin, Flux.normalise(x)), cx);
	0.156524 seconds (38.36 k CPU allocations: 2.249 MiB) (29 GPU allocations: 5.348 MiB, 0.60% memmgmt time)

	julia> CUDA.@time Zygote.gradient(x -> sum(sin, normal_now(x)), cx);
	0.079000 seconds (23.59 k CPU allocations: 1.313 MiB) (28 GPU allocations: 5.387 MiB, 0.21% memmgmt time)

	julia> CUDA.@time Zygote.gradient(x -> sum(sin, normal_new(x)), cx);
	0.074505 seconds (24.45 k CPU allocations: 1.396 MiB) (16 GPU allocations: 3.079 MiB, 0.15% memmgmt time)


	# batchnorm
	# bn2 = BatchNorm(100, affine=false) |> gpu # gives an error!
	# But what does work is more efficient than mine (GPU memory for the gradient): 2.672 MiB < 3.079 MiB

	julia> bn1 = BatchNorm(100) |> gpu
	BatchNorm(100) # 200 parameters, plus 200 non-trainable

	julia> CUDA.@time bn1(cx);
	0.000291 seconds (42 CPU allocations: 1.312 KiB) (1 GPU allocation: 390.625 KiB, 7.79% memmgmt time)

	julia> CUDA.@time Zygote.gradient(x -> sum(sin, bn1(x)), cx);
	0.099942 seconds (36.84 k CPU allocations: 2.064 MiB) (10 GPU allocations: 2.672 MiB, 0.13% memmgmt time)


	# layernorm
	# Here LayerNorm is literally calling Flux.normalise.
	# When the dims work, can it call BatchNorm's backend instead?

	julia> ln1 = LayerNorm(100) |> gpu
	LayerNorm(100) # 200 parameters

	julia> ln2 = LayerNorm(100, affine=false) |> gpu
	LayerNorm(100) # 200 parameters

	julia> CUDA.@time ln2(cx);
	0.000369 seconds (320 CPU allocations: 15.688 KiB) (7 GPU allocations: 800.781 KiB, 16.56% memmgmt time)

	julia> CUDA.@time Zygote.gradient(x -> sum(sin, ln2(x)), cx);
	0.136035 seconds (37.73 k CPU allocations: 2.104 MiB, 23.66% gc time) (26 GPU allocations: 5.376 MiB, 0.19% memmgmt time)


	##### GPU, January
	# Now updated to include https://github.com/JuliaGPU/GPUArrays.jl/pull/443

	julia> let x = CUDA.randn(100, 1000) # Forward pass, @btime
	@btime CUDA.@sync copy($x) # baseline
	println()
	@btime CUDA.@sync normal_now($x)
	@btime CUDA.@sync normal_new($x)
	@btime CUDA.@sync normal_newer($x)
	println()
	@btime CUDA.@sync layernorm($x)
	@btime CUDA.@sync layer_norm(nothing, nothing, $x)
	println()
	μ = @btime CUDA.@sync mean($x; dims=1)
	@btime CUDA.@sync var($x; mean=$μ, corrected=false, dims=1)
	end;
	17.749 μs (13 allocations: 400 bytes)

	96.518 μs (219 allocations: 10.88 KiB)
	106.690 μs (257 allocations: 12.56 KiB)
	56.772 μs (144 allocations: 9.45 KiB)

	93.800 μs (225 allocations: 11.41 KiB)
	43.795 μs (93 allocations: 4.33 KiB)

	43.810 μs (99 allocations: 4.59 KiB)
	41.918 μs (92 allocations: 5.17 KiB)

	julia> let x = CUDA.randn(1000, 10_000) # Forward pass, alloc
	CUDA.@time copy(x) # baseline
	println()
	CUDA.@time normal_now(x)
	CUDA.@time normal_new(x) # surprisingly large alloc, why? fixed.
	CUDA.@time normal_newer(x)
	println()
	CUDA.@time layernorm(x)
	CUDA.@time layer_norm(nothing, nothing, x)
	println()
	μ = CUDA.@time mean(x; dims=1)
	CUDA.@time var(x; mean=μ, corrected=false, dims=1) # allocated a big array before GPUArrays.jl PR 443
	end;
	0.000192 seconds (13 CPU allocations: 400 bytes) (1 GPU allocation: 38.147 MiB, 13.82% memmgmt time)

	0.001525 seconds (279 CPU allocations: 14.281 KiB) (4 GPU allocations: 38.261 MiB, 1.92% memmgmt time)
	0.000904 seconds (315 CPU allocations: 15.859 KiB) (5 GPU allocations: 38.300 MiB, 3.44% memmgmt time)
	0.006859 seconds (216 CPU allocations: 13.625 KiB) (2 GPU allocations: 38.261 MiB, 0.36% memmgmt time)

	0.009269 seconds (365 CPU allocations: 19.094 KiB) (4 GPU allocations: 38.261 MiB, 0.38% memmgmt time)
	0.005154 seconds (110 CPU allocations: 5.188 KiB) (2 GPU allocations: 38.223 MiB, 0.44% memmgmt time)

	0.000320 seconds (101 CPU allocations: 4.625 KiB) (2 GPU allocations: 78.125 KiB, 3.75% memmgmt time)
	0.015003 seconds (156 CPU allocations: 8.453 KiB) (1 GPU allocation: 39.062 KiB, 0.09% memmgmt time)

	julia> let x = CUDA.randn(100, 1000) # Gradient, @btime
	@btime CUDA.@sync Zygote.gradient(x -> sum(abs2, x), $x) # baseline, no norm!
	println()
	@btime CUDA.@sync Zygote.gradient(x -> sum(abs2, normal_now(x)), $x)
	@btime CUDA.@sync Zygote.gradient(x -> sum(abs2, normal_new(x)), $x)
	@btime CUDA.@sync Zygote.gradient(x -> sum(abs2, normal_newer(x)), $x)
	println()
	@btime CUDA.@sync Zygote.gradient(x -> sum(abs2, layernorm(x, Val(1))), $x)
	@btime CUDA.@sync Zygote.gradient(x -> sum(abs2, layer_norm(nothing, nothing, x)), $x)
	end
	176.829 μs (325 allocations: 14.14 KiB)

	685.050 μs (1250 allocations: 60.31 KiB)
	413.809 μs (789 allocations: 42.55 KiB)
	373.748 μs (666 allocations: 39.12 KiB)

	993.590 μs (1494 allocations: 74.17 KiB)
	365.185 μs (565 allocations: 25.61 KiB)

	julia> let x = CUDA.randn(1000, 10_000) # Gradient, alloc
	CUDA.@time Zygote.gradient(x -> sum(abs2, x), x) # baseline, no norm!
	println()
	CUDA.@time Zygote.gradient(x -> sum(abs2, normal_now(x)), x)
	CUDA.@time Zygote.gradient(x -> sum(abs2, normal_new(x)), x)
	CUDA.@time Zygote.gradient(x -> sum(abs2, normal_newer(x)), x)
	println()
	CUDA.@time Zygote.gradient(x -> sum(abs2, layernorm(x, Val(1))), x)
	CUDA.@time Zygote.gradient(x -> sum(abs2, layer_norm(nothing, nothing, x)), x)
	end;
	0.055923 seconds (16.29 k CPU allocations: 959.204 KiB) (6 GPU allocations: 190.735 MiB, 0.09% memmgmt time)

	0.079773 seconds (23.45 k CPU allocations: 1.308 MiB) (26 GPU allocations: 458.222 MiB, 0.18% memmgmt time)
	0.076155 seconds (24.29 k CPU allocations: 1.388 MiB) (14 GPU allocations: 267.258 MiB, 0.12% memmgmt time)
	0.079763 seconds (24.18 k CPU allocations: 1.392 MiB) (11 GPU allocations: 267.220 MiB, 0.11% memmgmt time)

	0.106857 seconds (30.34 k CPU allocations: 1.698 MiB) (27 GPU allocations: 496.369 MiB, 0.15% memmgmt time)
	0.076044 seconds (24.26 k CPU allocations: 1.388 MiB) (11 GPU allocations: 305.329 MiB, 0.11% memmgmt time)