@jrevels
Created August 6, 2017 19:05
using BenchmarkTools
using ReverseDiff: @forward, GradientTape, gradient!, compile
# Similar to XDiff's @diff_rule, except it gets the derivative automatically via forward mode.
# In future versions, `@forward` will no longer be necessary.
@forward logistic(x::Real) = 1 / (1 + exp(-x))
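# Quick sanity check (illustrative only, not in the original gist): the derivative that
# forward mode supplies for `logistic` should match the closed form
# logistic'(x) = logistic(x) * (1 - logistic(x)). ForwardDiff is a dependency of
# ReverseDiff, so it is assumed to be available here.
using ForwardDiff
@assert ForwardDiff.derivative(logistic, 1.0) ≈ logistic(1.0) * (1 - logistic(1.0))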
# This is how I would write this for ReverseDiff usage if parser fusion didn't mess things up.
# In the future, this form will be performant (all the pieces already exist, they just have
# to be hooked up).
function autoencoder_cost(We1, We2, Wd, b1, b2, x)
    firstLayer = logistic.(We1 * x .+ b1)
    encodedInput = logistic.(We2 * firstLayer .+ b2)
    reconstructedInput = logistic.(Wd * encodedInput)
    cost = sum((reconstructedInput .- x) .^ 2.0)
    return cost
end
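# Illustrative sketch (assumed usage, not part of the benchmark below): the fused form can
# already be taped and differentiated on small inputs; it just isn't expected to be as fast
# as the unfused version yet. The dimensions here are made up purely for the example.
small_vals = (rand(4, 8), rand(3, 4), rand(8, 3), rand(4), rand(3), rand(8, 2));
small_tape = GradientTape(autoencoder_cost, small_vals);
small_grads = gradient!(map(similar, small_vals), small_tape, small_vals);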
# Same thing as above, but uglier - the only difference is that this prevents parser fusion
function autoencoder_cost_no_fuse(We1, We2, Wd, b1, b2, x)
    tmp = We1 * x .+ b1
    firstLayer = logistic.(tmp)
    tmp = We2 * firstLayer .+ b2
    encodedInput = logistic.(tmp)
    reconstructedInput = logistic.(Wd * encodedInput)
    tmp = reconstructedInput .- x
    v = 2.0
    cost = sum(tmp .^ v)
    return cost
end
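# Sanity check (illustrative only): on small random inputs, the fused and unfused versions
# should compute the same cost. The dimensions here are arbitrary.
let We1 = rand(4, 8), We2 = rand(3, 4), Wd = rand(8, 3), b1 = rand(4), b2 = rand(3), x = rand(8, 5)
    @assert autoencoder_cost(We1, We2, Wd, b1, b2, x) ≈ autoencoder_cost_no_fuse(We1, We2, Wd, b1, b2, x)
end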
We1 = rand(2000, 10_000); b1 = rand(2000); We2 = rand(1000, 2000); b2 = rand(1000);
Wd = rand(10_000, 1000); x = rand(10_000, 100);  # 10_000-dimensional inputs, batch of 100 columns
vals = (We1, We2, Wd, b1, b2, x);
results = map(similar, vals);  # preallocated gradient buffers that gradient! fills in-place
f_tape = compile(GradientTape(autoencoder_cost_no_fuse, vals))  # record the tape once, then compile it
@benchmark gradient!($results, $f_tape, $vals)
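# Optional comparison (illustrative only): the same gradient computed from the uncompiled
# tape, plus the primal cost as a baseline. The compiled tape above should be at least as
# fast as the uncompiled one.
uncompiled_tape = GradientTape(autoencoder_cost_no_fuse, vals)
@benchmark gradient!($results, $uncompiled_tape, $vals)
@benchmark autoencoder_cost_no_fuse($We1, $We2, $Wd, $b1, $b2, $x)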