@patrick-kidger
Last active April 15, 2022 02:53
# JAX script adapted from https://news.ycombinator.com/item?id=31029699
# The main two modifications were to switch out `jax.value_and_grad` -> `jax.grad`,
# and to include the model update inside the JIT'd region.
import equinox as eqx
import jax
import jax.numpy as jnp
import jax.random as jr
import optax
import time


class MatrixExponentEstimator(eqx.Module):
    d0: eqx.nn.Linear
    d1: eqx.nn.Linear
    d2: eqx.nn.Linear

    def __init__(self, key):
        key1, key2, key3 = jr.split(key, 3)
        self.d0 = eqx.nn.Linear(4, 32, key=key1)
        self.d1 = eqx.nn.Linear(32, 16, key=key2)
        self.d2 = eqx.nn.Linear(16, 4, key=key3)

    def __call__(self, x):
        x = jax.numpy.tanh(self.d0(x))
        x = jax.numpy.tanh(self.d1(x))
        return self.d2(x)


def f(x):
    return jax.scipy.linalg.expm(x.reshape((2, 2))).reshape((4,))


def apply_matrix_exponential(x):
    return jax.numpy.apply_along_axis(f, 1, x)


def train():
    epochs = 10000
    key = jr.PRNGKey(1337)
    trainkey, testkey, modelkey = jr.split(key, 3)
    trainx = jr.normal(trainkey, shape=(10000, 2*2))
    trainy = apply_matrix_exponential(trainx)
    testx = jr.normal(testkey, shape=(10000, 2*2))
    testy = apply_matrix_exponential(testx)

    model = MatrixExponentEstimator(modelkey)
    adam = optax.adam(1e-3)
    opt_state = adam.init(model)

    def loss_fn(model, X, y):
        err = jax.vmap(model)(X) - y
        return jnp.mean(jnp.square(err))  # mse

    @jax.jit
    def make_step(model, X, y, opt_state):
        grads = jax.grad(loss_fn)(model, X, y)
        updates, opt_state = adam.update(grads, opt_state)
        model = eqx.apply_updates(model, updates)
        return model, opt_state

    print('Initial Train Loss: {:.4f}'.format(loss_fn(model, trainx, trainy).item()))
    print('Initial Test Loss: {:.4f}'.format(loss_fn(model, testx, testy).item()))
    for _ in range(3):
        t_start = time.time()
        for _ in range(epochs):
            model, opt_state = make_step(model, trainx, trainy, opt_state)
        print('Took: {:.2f} seconds'.format(time.time() - t_start))
        print('Train Loss: {:.4f}'.format(loss_fn(model, trainx, trainy).item()))
        print('Test Loss: {:.4f}'.format(loss_fn(model, testx, testy).item()))


if __name__ == '__main__':
    train()
# Output:
#
# Initial Train Loss: 6.4230
# Initial Test Loss: 6.1087
# Took: 15.61 seconds
# Train Loss: 0.0310
# Test Loss: 0.0273
# Took: 16.21 seconds
# Train Loss: 0.0035
# Test Loss: 0.0156
# Took: 16.80 seconds
# Train Loss: 0.0018
# Test Loss: 0.0111
# In comparison when running
# https://julialang.org/blog/2022/04/simple-chains/#simplechainsjl_in_action_30x-ing_pytorch_in_tiny_example
# (And switching out `G = SimpleChains.alloc_threaded_grad(mlpd)` for `G = similar(p)` to avoid
# `UndefVarError: alloc_threaded_grad not defined`.)
# I get:
#
# julia> report(p)
#
# ┌ Info: Loss:
# │ train = 113290.805f0
# └ test = 109008.77f0
#
# julia> for _ in 1:3
# @time SimpleChains.train_unbatched!(
# G, p, mlpdloss, X, SimpleChains.ADAM(), 10_000
# );
# report(p)
# end
# 19.304949 seconds (10.21 M allocations: 536.257 MiB, 0.85% gc time, 20.75% compilation time)
# ┌ Info: Loss:
# │ train = 143.67769f0
# └ test = 1110.0271f0
# 15.198186 seconds
# ┌ Info: Loss:
# │ train = 40.200905f0
# └ test = 733.7304f0
# 15.536973 seconds
# ┌ Info: Loss:
# │ train = 32.163437f0
# └ test = 639.05725f0
#
# Which produces timings that are (a) very similar to those obtained for JAX, but
# (b) noticeably different from the results reported by Chris in the HN thread.
#
# In addition, the loss results for the Julia script are really really bad. I don't know what's going
# on with that.
@rejuvyesh

rejuvyesh commented Apr 14, 2022

It's possible you are not using the latest version of SimpleChains.jl? I think alloc_threaded_grad is necessary for correct computation, which is likely why the losses are so much worse. You also need to start Julia with an appropriate number of threads.
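(For reference, not part of the original comment: one common way to start Julia with multiple threads and verify the count; the thread count below is just an example.)

julia --threads=6        # or: export JULIA_NUM_THREADS=6

julia> Threads.nthreads()
6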

On my machine, for your JAX version on a CPU (6 cores), I'm seeing:

Initial Train Loss: 6.4232
Initial Test Loss: 6.1088
Took: 19.39 seconds
Train Loss: 0.0307
Test Loss: 0.0270
Took: 18.91 seconds
Train Loss: 0.0037
Test Loss: 0.0157
Took: 20.09 seconds
Train Loss: 0.0018
Test Loss: 0.0111

While for the Julia version:

13.428804 seconds (17.76 M allocations: 949.815 MiB, 2.89% gc time, 100.00% compilation time)
┌ Info: Loss:
│   train = 12.414271f0
└   test = 12.085746f0
 17.685621 seconds (14.99 M allocations: 808.462 MiB, 4.02% gc time, 48.56% compilation time)
┌ Info: Loss:
│   train = 0.034923762f0
└   test = 0.052024134f0
  9.208631 seconds (19 allocations: 608 bytes)
┌ Info: Loss:
│   train = 0.0045825513f0
└   test = 0.03521506f0
  9.258355 seconds (30 allocations: 960 bytes)
┌ Info: Loss:
│   train = 0.0026099205f0
└   test = 0.023117168f0

@ChrisRackauckas

ChrisRackauckas commented Apr 15, 2022

(And switching out G = SimpleChains.alloc_threaded_grad(mlpd) for G = similar(p) to avoid UndefVarError: alloc_threaded_grad not defined.)

Yeah, don't do that; that's not correct. Just get the release version.

With an AMD Ryzen 9 5950X 16-core processor I get:

Took: 14.52 seconds
Train Loss: 0.0304
Test Loss: 0.0268
Took: 14.00 seconds
Train Loss: 0.0033
Test Loss: 0.0154
Took: 13.85 seconds
Train Loss: 0.0018
Test Loss: 0.0112

vs https://gist.github.com/rejuvyesh/1948888e05a4c74d4203760003df6ff2 getting:

julia> Threads.nthreads()
16

  5.097569 seconds (14.81 M allocations: 798.000 MiB, 3.94% gc time, 73.62% compilation time)
┌ Info: Loss:
│   train = 0.022585187f0
└   test = 0.32509857f0
  1.310997 seconds
┌ Info: Loss:
│   train = 0.0038023277f0
└   test = 0.23108596f0
  1.295088 seconds
┌ Info: Loss:
│   train = 0.0023415526f0
└   test = 0.20991518f0

while

julia> Threads.nthreads()
1

 13.182836 seconds (12.07 M allocations: 647.639 MiB, 1.18% gc time, 23.74% compilation time)
┌ Info: Loss:
│   train = 0.011142263f0
└   test = 0.03988739f0
 10.150889 seconds
┌ Info: Loss:
│   train = 0.0038442607f0
└   test = 0.023254404f0
 10.199495 seconds
┌ Info: Loss:
│   train = 0.002967517f0
└   test = 0.019188924f0

So slightly more than 10x, but the advantage is essentially lost when threading is disabled. So I'm also wondering whether multithreading was disabled there, since that's the only way I could recreate it.

(And note that SimpleChains is going to be a bit CPU-dependent since it tries to force AVX512 when it can, so that's where you get the 0.5 seconds with the Intel i9 10980XE. I want that chip now...)

I invite others to submit timings and share CPU and Threads.nthreads().

@extradosages

extradosages commented Apr 15, 2022

From https://gist.github.com/rejuvyesh/1948888e05a4c74d4203760003df6ff2.

 29.738605 seconds (10.81 M allocations: 571.659 MiB, 0.75% gc time, 19.20% compilation time)
┌ Info: Loss:
│   train = 0.012048299f0
└   test = 0.1272674f0
 23.849172 seconds
┌ Info: Loss:
│   train = 0.0028957298f0
└   test = 0.088817276f0
 24.116243 seconds
┌ Info: Loss:
│   train = 0.0019454482f0
└   test = 0.078549586f0
julia> Threads.nthreads()
1

julia> versioninfo()
Julia Version 1.8.0-beta3
Commit 3e092a2521 (2022-03-29 15:42 UTC)
Platform Info:
  OS: Linux (x86_64-pc-linux-gnu)
  CPU: 8 × AMD Ryzen 7 PRO 3700U w/ Radeon Vega Mobile Gfx
  WORD_SIZE: 64
  LIBM: libopenlibm
  LLVM: libLLVM-13.0.1 (ORCJIT, znver1)
  Threads: 1 on 8 virtual cores
Environment:
  JULIA_PATH = /usr/local/julia
  JULIA_GPG = 3673DF529D9049477F76B37566E3C7DC03D6E495
  JULIA_VERSION = 1.8.0-beta3

julia> 

@ChrisRackauckas

Set your threads and run it again too (and run the Jax code). See https://docs.julialang.org/en/v1/manual/multi-threading/#Starting-Julia-with-multiple-threads for how to start Julia with multiple threads.

@chriselrod

chriselrod commented Apr 15, 2022

In a fresh Julia session, so all first calls will be compiling (note that @time doesn't actually show the full compile time, but I think it's reasonably close here):

julia> using SimpleChains

julia> function f(x)
         N = Base.isqrt(length(x))
         A = reshape(view(x, 1:N*N), (N,N))
         expA = exp(A)
         vec(expA)
       end
f (generic function with 1 method)

julia> T = Float32;

julia> D = 2 # 2x2 matrices
2

julia> X = randn(T, D*D, 10_000); # random input matrices

julia> Y = reduce(hcat, map(f, eachcol(X))); # `mapreduce` is not optimized for `hcat`, but `reduce` is

julia> Xtest = randn(T, D*D, 10_000);

julia> Ytest = reduce(hcat, map(f, eachcol(Xtest)));

julia> mlpd = SimpleChain(
         static(4),
         TurboDense(tanh, 32),
         TurboDense(tanh, 16),
         TurboDense(identity, 4)
       )
SimpleChain with the following layers:
TurboDense static(32) with bias.
Activation layer applying: tanh
TurboDense static(16) with bias.
Activation layer applying: tanh
TurboDense static(4) with bias.

julia> @time p = SimpleChains.init_params(mlpd);
  7.185457 seconds (4.30 M allocations: 226.262 MiB, 0.52% gc time, 100.00% compilation time)

julia> G = SimpleChains.alloc_threaded_grad(mlpd);

julia> mlpdloss = SimpleChains.add_loss(mlpd, SquaredLoss(Y));

julia> mlpdtest = SimpleChains.add_loss(mlpd, SquaredLoss(Ytest));

julia> report = let mtrain = mlpdloss, X=X, Xtest=Xtest, mtest = mlpdtest
         p -> begin
           let train = mlpdloss(X, p), test = mlpdtest(Xtest, p)
             @info "Loss:" train test
           end
         end
       end
#1 (generic function with 1 method)

julia> report(p)
┌ Info: Loss:
│   train = 13.415737f0
└   test = 12.306785f0

julia> for _ in 1:3
         @time SimpleChains.train_unbatched!(
           G, p, mlpdloss, X, SimpleChains.ADAM(), 10_000
         );
         report(p)
       end
  4.810973 seconds (13.03 M allocations: 686.357 MiB, 8.25% gc time, 89.76% compilation time)
┌ Info: Loss:
│   train = 0.011851382f0
└   test = 0.017254675f0
  0.410168 seconds
┌ Info: Loss:
│   train = 0.0037487738f0
└   test = 0.009099905f0
  0.410368 seconds
┌ Info: Loss:
│   train = 0.002041543f0
└   test = 0.0065089874f0

(simplechainsrelease) pkg> st
Status `~/Documents/progwork/julia/env/simplechainsrelease/Project.toml`
  [eb30cadb] MLDatasets v0.5.16
  [de6bee2f] SimpleChains v0.2.1

julia> versioninfo()
Julia Version 1.8.0-beta3
Commit 3e092a2521 (2022-03-29 15:42 UTC)
Platform Info:
  OS: Linux (x86_64-redhat-linux)
  CPU: 36 × Intel(R) Core(TM) i9-10980XE CPU @ 3.00GHz
  WORD_SIZE: 64
  LIBM: libopenlibm
  LLVM: libLLVM-13.0.1 (ORCJIT, cascadelake)
  Threads: 18 on 36 virtual cores

On the same CPU, running this Jax script:

WARNING:absl:No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)
Initial Train Loss: 6.4232
Initial Test Loss: 6.1088
Took: 9.26 seconds
Train Loss: 0.0304
Test Loss: 0.0268
Took: 8.98 seconds
Train Loss: 0.0036
Test Loss: 0.0156
Took: 9.01 seconds
Train Loss: 0.0018
Test Loss: 0.0111

So after compilation, I get a >20x speedup.

Anyway, I do really appreciate you taking the time to translate a Jax example.
It takes a lot of time and effort for me to try and set up examples and do enough due diligence so that I can be reasonably confident that I am not treating the other libraries too unfairly, aside from the unfairness of not benchmarking them for their intended use cases.

SimpleChains was open sourced and released today, but there is actually an old version that was already open source and registered early in its development.
There have been a few breaking changes since then, and some additions, like alloc_threaded_grad.
The fact that you're getting wrong answers, and that alloc_threaded_grad wasn't defined, makes it seem like you're using the old version by mistake.

You can check with ] st or ] st SimpleChains, like I showed above.
I created a new project, which you can do via

julia --project=/path/to/project

and then installed SimpleChains (and MLDatasets, which isn't needed for this test) in it. I like to keep lots of small projects, as this reduces the risk of version conflicts. It could be that you have other Julia packages causing a conflict, which would leave you with the old version of SimpleChains.
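A minimal sketch of that workflow (the project path and prompt name are placeholders; the UUID and version match the status output shown above; press ] at the julia> prompt to enter Pkg mode):

julia --project=/path/to/project

(project) pkg> add SimpleChains
(project) pkg> st SimpleChains
  [de6bee2f] SimpleChains v0.2.1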

However, there were other breakages, like how TurboDense layers were defined, so you'd have had to hit other errors if you were on this old version. That should have been enough to tip you off that something odd was going on; if this is the case, I have to commend you for your patience in getting the example to run at all!

Maybe the blog post should have emphasized version 0.2.1.

So slightly more than 10x, but the advantage is essentially lost when threading is disabled. So I'm also wondering whether multithreading was disabled there, since that's the only way I could recreate it.

The description said:

In comparison when running
https://julialang.org/blog/2022/04/simple-chains/#simplechainsjl_in_action_30x-ing_pytorch_in_tiny_example
(And switching out G = SimpleChains.alloc_threaded_grad(mlpd) for G = similar(p) to avoid
UndefVarError: alloc_threaded_grad not defined.)

While I'm still a little confused as to what version Patrick was using, if it was in fact the latest release, then using G = similar(p) forces single-threaded training, as there is only enough gradient buffer for a single thread.
G is normally a matrix with one column per thread used (although not an actual Matrix, as it needs a certain alignment to avoid false sharing).
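For intuition, a generic plain-Julia sketch of the per-thread-buffer idea; this is not SimpleChains' actual implementation (which uses a specially aligned type rather than a Matrix), just an illustration of why one column per thread avoids write races:

using Base.Threads

nparams = 100
G = zeros(Float32, nparams, nthreads())    # one gradient column per thread
@threads for i in 1:10_000
    g = @view G[:, threadid()]             # each thread accumulates into its own column,
    g .+= 1f0                              # so no two threads write to the same memory
end
grad = vec(sum(G; dims=2))                 # combine the per-thread partial gradients
# With G = similar(p) there is only a single buffer, so only one thread can safely accumulate.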

(And note that SimpleChains is going to be a bit CPU-dependent since it tries to force AVX512 when it can, so that's where you get the 0.5 seconds with the Intel i9 10980XE. I want that chip now...)

It's amazing for benchmarks that can use AVX512 (which is most of them I write...), but there are rumors that Zen 4 will feature AVX512, and I'm also looking forward to Sapphire Rapids-X from Intel. Both should be coming out this year.
I'll probably buy one of the two, depending on which is better at SIMD.

Also, with a single thread, I get:

julia> for _ in 1:3
         @time SimpleChains.train_unbatched!(
           G, p, mlpdloss, X, SimpleChains.ADAM(), 10_000
         );
         report(p)
       end
 12.635821 seconds (10.67 M allocations: 561.711 MiB, 1.42% gc time, 26.70% compilation time)
┌ Info: Loss:
│   train = 0.020518048f0
└   test = 0.1018267f0
  9.236013 seconds
┌ Info: Loss:
│   train = 0.0042917724f0
└   test = 0.06968568f0
  9.249139 seconds
┌ Info: Loss:
│   train = 0.002604088f0
└   test = 0.06139791f0

Which really isn't much better than your 5950X. It seems like the main advantage might be that it scales better with multithreading, perhaps because the die is monolithic?

Looking at perf, I see that I have a ton of L1 data cache misses:

julia> @pstats "(cpu-cycles,task-clock),(instructions,branch-instructions,branch-misses), (L1-dcache-load-misses, L1-dcache-loads, cache-misses, cache-references)" begin
           SimpleChains.train_unbatched!(
                  G, p, mlpdloss, X, SimpleChains.ADAM(), 10_000
                );
       end
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
┌ cpu-cycles               3.93e+10   33.3%  #  4.2 cycles per ns
└ task-clock               9.43e+09   33.3%  #  9.4 s
┌ instructions             4.18e+10   66.7%  #  1.1 insns per cycle
│ branch-instructions      1.50e+09   66.7%  #  3.6% of instructions
└ branch-misses            5.33e+05   66.7%  #  0.0% of branch instructions
┌ L1-dcache-load-misses    3.61e+09   33.3%  # 30.2% of dcache loads
│ L1-dcache-loads          1.20e+10   33.3%
│ cache-misses             2.19e+04   33.3%  #  0.0% of cache references
└ cache-references         3.03e+09   33.3%
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

While multithreaded, I get

julia> @pstats "(cpu-cycles,task-clock),(instructions,branch-instructions,branch-misses), (L1-dcache-load-misses, L1-dcache-loads, cache-misses, cache-references)" begin
                 SimpleChains.train_unbatched!(
                        G, p, mlpdloss, X, SimpleChains.ADAM(), 10_000
                      );
             end
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
┌ cpu-cycles               3.30e+10   33.4%  #  4.0 cycles per ns
└ task-clock               8.21e+09   33.4%  #  8.2 s
┌ instructions             4.29e+10   66.7%  #  1.3 insns per cycle
│ branch-instructions      1.76e+09   66.7%  #  4.1% of instructions
└ branch-misses            1.36e+07   66.7%  #  0.8% of branch instructions
┌ L1-dcache-load-misses    3.59e+09   33.3%  # 29.6% of dcache loads
│ L1-dcache-loads          1.21e+10   33.3%
│ cache-misses             6.94e+02   33.3%  #  0.0% of cache references
└ cache-references         4.63e+07   33.3%
                 aggregated from 18 threads
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

I.e., I actually have higher IPC when multithreaded, and less overall CPU time. That is, I have better-than-linear scaling here.
The L1d cache miss rate isn't much better, and I don't think it can explain the difference; the branch miss rate differs more noticeably.
When I see this, I normally assume it's related to the large L2 cache (1 MiB per core); each core has a private L2, so by using multiple cores we get a lot more cache overall.

@chriselrod

Oh, something really important to note when comparing losses: SquaredLoss in SimpleChains.jl multiplies the error by 1/2.
I could change it if that's too confusing, but it does mean a mean squared error of 0.05 in SimpleChains is the same as an error of 0.1 in Jax/PyTorch.
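A tiny illustration of that scaling, using a made-up error vector rather than anything from SimpleChains:

using Statistics

err = randn(Float32, 40_000)             # hypothetical per-element errors
mse_plain  = mean(abs2, err)             # plain mean squared error, as in the Jax script
mse_halved = 0.5f0 * mean(abs2, err)     # with the extra 1/2 factor described above
@assert mse_plain ≈ 2 * mse_halved       # e.g. 0.05 here corresponds to 0.1 there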

So the multithreaded errors from SimpleChains above:

  0.410368 seconds
┌ Info: Loss:
│   train = 0.002041543f0
└   test = 0.0065089874f0

Are about comparable to what I see from Jax:

Took: 9.01 seconds
Train Loss: 0.0018
Test Loss: 0.0111

But, I need to look into why the initial errors are so much higher than what I got from PyTorch or am getting from Jax here.

I also haven't tested single-threaded fitting much; something seems to be going very wrong here:

  9.249139 seconds
┌ Info: Loss:
│   train = 0.002604088f0
└   test = 0.06139791f0

Why is the test loss 10x higher?
I'll take a look...

@chriselrod

chriselrod commented Apr 15, 2022

This is the opposite of what Chris R observed, though.
For me, the pattern is fairly consistent; multithreaded:

julia> SimpleChains.init_params!(mlpdloss, p);

julia> for _ in 1:3
         @time SimpleChains.train_unbatched!(
           G, p, mlpdloss, X, SimpleChains.ADAM(), 10_000
         );
         report(p)
       end
  0.463921 seconds
┌ Info: Loss:
│   train = 0.0129161235f0
└   test = 0.018706292f0
  0.406318 seconds
┌ Info: Loss:
│   train = 0.0032260346f0
└   test = 0.009573642f0
  0.409833 seconds
┌ Info: Loss:
│   train = 0.0021066444f0
└   test = 0.0070530926f0

julia> SimpleChains.init_params!(mlpdloss, p);

julia> for _ in 1:3
         @time SimpleChains.train_unbatched!(
           G, p, mlpdloss, X, SimpleChains.ADAM(), 10_000
         );
         report(p)
       end
  0.412915 seconds
┌ Info: Loss:
│   train = 0.012180713f0
└   test = 0.019388271f0
  0.409664 seconds
┌ Info: Loss:
│   train = 0.0027349133f0
└   test = 0.0103735635f0
  0.409908 seconds
┌ Info: Loss:
│   train = 0.001797435f0
└   test = 0.007536795f0

julia> SimpleChains.init_params!(mlpdloss, p);

julia> for _ in 1:3
         @time SimpleChains.train_unbatched!(
           G, p, mlpdloss, X, SimpleChains.ADAM(), 10_000
         );
         report(p)
       end
  0.412446 seconds
┌ Info: Loss:
│   train = 0.012522352f0
└   test = 0.017825317f0
  0.413578 seconds
┌ Info: Loss:
│   train = 0.0029457442f0
└   test = 0.008456751f0
  0.410407 seconds
┌ Info: Loss:
│   train = 0.0019602645f0
└   test = 0.00613266f0

Single threaded:

julia> SimpleChains.init_params!(mlpdloss, p);

julia> for _ in 1:3
         @time SimpleChains.train_unbatched!(
           G, p, mlpdloss, X, SimpleChains.ADAM(), 10_000
         );
         report(p)
       end
  9.394842 seconds
┌ Info: Loss:
│   train = 0.014558737f0
└   test = 0.091713026f0
  9.425004 seconds
┌ Info: Loss:
│   train = 0.0032704105f0
└   test = 0.06821577f0
  9.434632 seconds
┌ Info: Loss:
│   train = 0.0021691609f0
└   test = 0.060845226f0

julia> SimpleChains.init_params!(mlpdloss, p);

julia> for _ in 1:3
         @time SimpleChains.train_unbatched!(
           G, p, mlpdloss, X, SimpleChains.ADAM(), 10_000
         );
         report(p)
       end
  9.368543 seconds
┌ Info: Loss:
│   train = 0.016641248f0
└   test = 0.08612959f0
  9.386725 seconds
┌ Info: Loss:
│   train = 0.0030917034f0
└   test = 0.052406352f0
  9.405264 seconds
┌ Info: Loss:
│   train = 0.0021836497f0
└   test = 0.04452359f0

julia> SimpleChains.init_params!(mlpdloss, p);

julia> for _ in 1:3
         @time SimpleChains.train_unbatched!(
           G, p, mlpdloss, X, SimpleChains.ADAM(), 10_000
         );
         report(p)
       end
  9.381625 seconds
┌ Info: Loss:
│   train = 0.01729222f0
└   test = 0.09130784f0
  9.396374 seconds
┌ Info: Loss:
│   train = 0.004226131f0
└   test = 0.07051856f0
  9.376561 seconds
┌ Info: Loss:
│   train = 0.0027545807f0
└   test = 0.062378857f0

@ChrisRackauckas

For me it's just due to initializations:

  5.365769 seconds (14.81 M allocations: 917.160 MiB, 6.05% gc time, 75.77% compilation time)
┌ Info: Loss:
│   train = 0.012597987f0
└   test = 0.023731364f0
  1.284829 seconds
┌ Info: Loss:
│   train = 0.0025600272f0
└   test = 0.010612518f0
  1.307484 seconds
┌ Info: Loss:
│   train = 0.0015771794f0
└   test = 0.008084581f0

  1.301913 seconds
┌ Info: Loss:
│   train = 0.0015198826f0
└   test = 0.007395477f0
  1.303582 seconds
┌ Info: Loss:
│   train = 0.0010119737f0
└   test = 0.0063088294f0
  1.310925 seconds
┌ Info: Loss:
│   train = 0.00095060834f0
└   test = 0.0058895415f0

julia> SimpleChains.init_params!(mlpdloss, p);

julia> report(p)
┌ Info: Loss:
│   train = 14.027461f0
└   test = 14.123837f0

julia> for _ in 1:3
         @time SimpleChains.train_unbatched!(
           G, p, mlpdloss, X, SimpleChains.ADAM(), 10_000
         );
         report(p)
       end
  1.313561 seconds
┌ Info: Loss:
│   train = 0.0147712445f0
└   test = 0.028131848f0
  1.315667 seconds
┌ Info: Loss:
│   train = 0.0036588982f0
└   test = 0.014324798f0
  1.315291 seconds
┌ Info: Loss:
│   train = 0.0024623652f0
└   test = 0.011305514f0

Both threaded, of course.

@chriselrod

With this PR, the number of threads will no longer influence training behavior: PumasAI/SimpleChains.jl#52
That PR also removes the 0.5x multiplier.

But it defines the mean with respect to batch size instead of total data length, meaning we're dividing by 10k instead of 40k. Hence, for equivalent accuracy, the error SimpleChains reports should be about 4x higher, which is now what I observe:

julia> SimpleChains.init_params!(mlpdloss, p);

julia> report(p)
┌ Info: Loss:
│   train = 24.167797f0
└   test = 23.112621f0

julia> for _ in 1:3
         @time SimpleChains.train_unbatched!(
           G, p, mlpdloss, X, SimpleChains.ADAM(), 10_000
         );
         report(p)
       end
  0.413229 seconds
┌ Info: Loss:
│   train = 0.024573077f0
└   test = 0.110939406f0
  0.409671 seconds
┌ Info: Loss:
│   train = 0.0071228216f0
└   test = 0.06868424f0
  0.409380 seconds
┌ Info: Loss:
│   train = 0.0048818965f0
└   test = 0.0587865f0
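
To spell out the 4x normalization claim above (a sketch with the shapes from this example: 10,000 samples with 4 outputs each, and a made-up total sum of squared errors):

S = 123.4f0                        # made-up sum of squared errors over all 40_000 entries
per_element_mean = S / 40_000      # mean over every scalar entry, as jnp.mean computes
per_sample_mean  = S / 10_000      # mean with respect to batch size, as after the PR
@assert per_sample_mean ≈ 4 * per_element_mean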

So accuracy is similar on my computer, but SimpleChains takes just over 0.4 seconds per 10k epochs instead of 9 seconds.

Anyway, I should probably add more options to let users control the error function's behavior, e.g. what it takes the mean with respect to (if it takes the mean at all).
