darsnack/snippet-1.jl

## snippet-1.jl
using DiffEqFlux, OrdinaryDiffEq, Flux, Printf
using Flux.Losses: logitcrossentropy
using Flux.Data: DataLoader
using MLDatasets
using MLDataUtils:  LabelEnc, convertlabel, stratifiedobs
using CUDA
CUDA.allowscalar(false)

# Build the MNIST train/test `DataLoader` pair.
#
# `batchsize` is forwarded to both loaders and `train_split` is handed to
# `stratifiedobs` as `p`. Returns `(train_loader, test_loader)`; only the
# training loader shuffles.
function loadmnist(batchsize = bs, train_split = 0.9)
    raw_imgs, raw_labels = MNIST.traindata()

    # Insert a singleton third axis, (H, W, N) -> (H, W, 1, N), stored as Float32.
    height, width, nimgs = size(raw_imgs, 1), size(raw_imgs, 2), size(raw_imgs, 3)
    features = Float32.(reshape(raw_imgs, height, width, 1, nimgs))

    # One-of-K encode the labels through MLDataUtils, with 0:9 as the native labels.
    native = LabelEnc.NativeLabels(collect(0:9))
    targets = convertlabel(LabelEnc.OneOfK, raw_labels, native)

    train_part, test_part = stratifiedobs((features, targets), p = train_split)
    x_train, y_train = train_part
    x_test, y_test = test_part

    # Flux's DataLoader does the mini-batching; labels are cast to Float32 here.
    train_loader = DataLoader((x_train, Float32.(y_train));
                              batchsize = batchsize, shuffle = true)
    test_loader = DataLoader((x_test, Float32.(y_test));
                             batchsize = batchsize, shuffle = false)
    return (train_loader, test_loader)
end

# Mini-batch size and the fraction handed to `loadmnist` as `train_split`.
const bs = 128
const train_split = 0.9

train_dataloader, test_dataloader = loadmnist(bs, train_split);

# Downsampling stack: 1 -> 64 channels, then two stride-2 4x4 convolutions.
down = gpu(Chain(
    Conv((3, 3), 1 => 64, relu),
    GroupNorm(64, 64),
    Conv((4, 4), 64 => 64, relu; stride = (2, 2), pad = (1, 1)),
    GroupNorm(64, 64),
    Conv((4, 4), 64 => 64; stride = (2, 2), pad = (1, 1)),
))

# Network used as the ODE right-hand side: two padded 3x3 convolutions.
dudt = gpu(Chain(
    Conv((3, 3), 64 => 64, relu; pad = (1, 1)),
    Conv((3, 3), 64 => 64, relu; pad = (1, 1)),
))

# Classification head: GroupNorm, ReLU, 6x6 mean-pool, reshape to 64 rows, Dense.
fc = gpu(Chain(
    GroupNorm(64, 64),
    x -> relu.(x),
    MeanPool((6, 6)),
    x -> reshape(x, 64, :),
    Dense(64, 10),
))

# Neural ODE on the time span (0, 1), solved with Tsit5, with `save_everystep`
# and `save_start` both switched off.
nn_ode = gpu(NeuralODE(dudt, (0.f0, 1.f0), Tsit5();
                       save_everystep = false,
                       reltol = 1e-3, abstol = 1e-3,
                       save_start = false))

# Move the ODE-layer output to the GPU and keep index 1 along its fifth axis.
function DiffEqArray_to_Array(x)
    return gpu(x)[:, :, :, :, 1]
end

# End-to-end classifier. Per-stage shapes as annotated by the original author:
#   down   : (28,28,1,BS) -> (6,6,64,BS)
#   nn_ode : (6,6,64,BS)  -> (6,6,64,BS)
#   fc     : (6,6,64,BS)  -> (10, BS)
model = Chain(
    down,
    nn_ode,
    DiffEqArray_to_Array,
    x -> reshape(x, 6, 6, 64, :),
    fc,
)

# Single-observation batch (first image and its label column) moved to the GPU.
img = gpu(train_dataloader.data[1][:, :, :, 1:1])
lab = gpu(train_dataloader.data[2][:, 1:1])

# Run the downsampler alone, then the whole model, on that sample.
x_d = down(img)
x_m = model(img)

# For every column of `x`, return the row index of that column's largest entry.
function classify(x)
    return argmax.(eachcol(x))
end

# Fraction of correctly classified observations over at most `n_batches`
# batches of `data`, where `data` yields `(x, y)` pairs. Targets and model
# outputs are copied to the CPU and reduced to class indices with `classify`.
function accuracy(model, data; n_batches = 100)
    n_hits = 0
    n_seen = 0
    for (batch_no, (inputs, targets)) in enumerate(data)
        # Stop once the requested number of batches has been scored.
        if batch_no > n_batches
            break
        end
        truth = classify(cpu(targets))
        guess = classify(cpu(model(inputs)))
        n_hits += count(truth .== guess)
        n_seen += length(truth)
    end
    return n_hits / n_seen
end

# Training objective: logit cross-entropy between the model output and the labels.
function loss(x, y)
    return logitcrossentropy(model(x), y)
end

# Warm-up call of the loss on the single-sample batch.
loss(img, lab)

# Warm-up call of the accuracy routine on the training loader.
accuracy(model, CuIterator(train_dataloader))

# Optimiser and the global iteration counter incremented by `cb`.
opt = ADAM(0.001)
iter = 0

# Training callback: increment the global `iter` counter and, on iterations
# 1, 11, 21, ..., print the accuracy over all test batches, then run the
# garbage collector and `CUDA.reclaim()`.
function cb()
    global iter += 1
    # Only iterations with remainder 1 modulo 10 produce a report.
    iter % 10 == 1 || return nothing

    n_test_batches = length(test_dataloader)
    test_accuracy = 100 * accuracy(model, CuIterator(test_dataloader);
                                   n_batches = n_test_batches)
    @printf("Iter: %3d || Test Accuracy: %2.3f\n", iter, test_accuracy)

    GC.gc()
    return CUDA.reclaim()
end

# Timed training run over the training loader. The trainable parameters are
# those of `down`, the ODE layer (`nn_ode.p`) and `fc`; `cb` is the callback.
@time Flux.train!(
    loss,
    Flux.params(down, nn_ode.p, fc),
    CuIterator(train_dataloader),
    opt;
    cb = cb,
)

## snippet-2.jl
using DiffEqFlux, OrdinaryDiffEq, Flux, Printf
using Flux.Losses: logitcrossentropy
using Flux.Data: DataLoader
using MLDatasets
using MLDataUtils:  LabelEnc, convertlabel, stratifiedobs
using CUDA
CUDA.allowscalar(false)

# Build the MNIST train/test `DataLoader` pair.
#
# `batchsize` is forwarded to both loaders and `train_split` is handed to
# `stratifiedobs` as `p`. Returns `(train_loader, test_loader)`; only the
# training loader shuffles.
function loadmnist(batchsize = bs, train_split = 0.9)
    raw_imgs, raw_labels = MNIST.traindata()

    # Insert a singleton third axis, (H, W, N) -> (H, W, 1, N), stored as Float32.
    height, width, nimgs = size(raw_imgs, 1), size(raw_imgs, 2), size(raw_imgs, 3)
    features = Float32.(reshape(raw_imgs, height, width, 1, nimgs))

    # One-of-K encode the labels through MLDataUtils, with 0:9 as the native labels.
    native = LabelEnc.NativeLabels(collect(0:9))
    targets = convertlabel(LabelEnc.OneOfK, raw_labels, native)

    train_part, test_part = stratifiedobs((features, targets), p = train_split)
    x_train, y_train = train_part
    x_test, y_test = test_part

    # Flux's DataLoader does the mini-batching; labels are cast to Float32 here.
    train_loader = DataLoader((x_train, Float32.(y_train));
                              batchsize = batchsize, shuffle = true)
    test_loader = DataLoader((x_test, Float32.(y_test));
                             batchsize = batchsize, shuffle = false)
    return (train_loader, test_loader)
end

# Mini-batch size and the fraction handed to `loadmnist` as `train_split`.
const bs = 128
const train_split = 0.9

train_dataloader, test_dataloader = loadmnist(bs, train_split);

# Downsampling stack: 1 -> 64 channels, then two stride-2 4x4 convolutions.
down = gpu(Chain(
    Conv((3, 3), 1 => 64, relu),
    GroupNorm(64, 64),
    Conv((4, 4), 64 => 64, relu; stride = (2, 2), pad = (1, 1)),
    GroupNorm(64, 64),
    Conv((4, 4), 64 => 64; stride = (2, 2), pad = (1, 1)),
))

# Network used as the ODE right-hand side: two padded 3x3 convolutions.
dudt = gpu(Chain(
    Conv((3, 3), 64 => 64, relu; pad = (1, 1)),
    Conv((3, 3), 64 => 64, relu; pad = (1, 1)),
))

# Classification head: GroupNorm, ReLU, 6x6 mean-pool, reshape to 64 rows, Dense.
fc = gpu(Chain(
    GroupNorm(64, 64),
    x -> relu.(x),
    MeanPool((6, 6)),
    x -> reshape(x, 64, :),
    Dense(64, 10),
))

# Neural ODE on the time span (0, 1), solved with Tsit5, with `save_everystep`
# and `save_start` both switched off.
nn_ode = gpu(NeuralODE(dudt, (0.f0, 1.f0), Tsit5();
                       save_everystep = false,
                       reltol = 1e-3, abstol = 1e-3,
                       save_start = false))

# Move the ODE-layer output to the GPU and keep index 1 along its fifth axis.
function DiffEqArray_to_Array(x)
    return gpu(x)[:, :, :, :, 1]
end

# End-to-end classifier. Per-stage shapes as annotated by the original author:
#   down   : (28,28,1,BS) -> (6,6,64,BS)
#   nn_ode : (6,6,64,BS)  -> (6,6,64,BS)
#   fc     : (6,6,64,BS)  -> (10, BS)
model = Chain(
    down,
    nn_ode,
    DiffEqArray_to_Array,
    x -> reshape(x, 6, 6, 64, :),
    fc,
)

# Single-observation batch (first image and its label column) moved to the GPU.
img = gpu(train_dataloader.data[1][:, :, :, 1:1])
lab = gpu(train_dataloader.data[2][:, 1:1])

# Run the downsampler alone, then the whole model, on that sample.
x_d = down(img)
x_m = model(img)

# For every column of `x`, return the row index of that column's largest entry.
function classify(x)
    return argmax.(eachcol(x))
end

# Fraction of correctly classified observations over at most `n_batches`
# batches of `data`, where `data` yields `(x, y)` pairs. Targets and model
# outputs are copied to the CPU and reduced to class indices with `classify`.
function accuracy(model, data; n_batches = 100)
    n_hits = 0
    n_seen = 0
    for (batch_no, (inputs, targets)) in enumerate(data)
        # Stop once the requested number of batches has been scored.
        if batch_no > n_batches
            break
        end
        truth = classify(cpu(targets))
        guess = classify(cpu(model(inputs)))
        n_hits += count(truth .== guess)
        n_seen += length(truth)
    end
    return n_hits / n_seen
end

# Training objective: logit cross-entropy between the model output and the labels.
function loss(x, y)
    return logitcrossentropy(model(x), y)
end

# Warm-up call of the loss on the single-sample batch.
loss(img, lab)

# Warm-up call of the accuracy routine on the training loader.
accuracy(model, CuIterator(train_dataloader))

# Optimiser and the global iteration counter incremented by `cb`.
opt = ADAM(0.001)
iter = 0

# Training callback: increment the global `iter` counter and, on iterations
# 1, 11, 21, ..., print the accuracy measured over all test batches.
function cb()
    global iter += 1
    # Only iterations with remainder 1 modulo 10 produce a report.
    iter % 10 == 1 || return nothing

    n_test_batches = length(test_dataloader)
    test_accuracy = 100 * accuracy(model, CuIterator(test_dataloader);
                                   n_batches = n_test_batches)
    @printf("Iter: %3d || Test Accuracy: %2.3f\n", iter, test_accuracy)
    return nothing
end

# Timed training run over the training loader. The trainable parameters are
# those of `down`, the ODE layer (`nn_ode.p`) and `fc`; `cb` is the callback.
@time Flux.train!(
    loss,
    Flux.params(down, nn_ode.p, fc),
    CuIterator(train_dataloader),
    opt;
    cb = cb,
)

## snippet-3.jl
using DiffEqFlux, OrdinaryDiffEq, Flux, NNlib,  Printf
using Flux: logitcrossentropy
using DataLoaders
using MLDatasets
using MLDataPattern
using LearnBase
using MLDataUtils:  LabelEnc, convertlabel
using CUDA
CUDA.allowscalar(false)

# Label helpers. `array_wrap` passes arrays through unchanged and wraps a lone
# number in a one-element vector; `onehot` one-of-K encodes labels through
# MLDataUtils using 0:9 as the label set.
function array_wrap(x::AbstractArray)
    return x
end

function array_wrap(x::Number)
    return [x]
end

function onehot(labels_raw)
    return convertlabel(LabelEnc.OneOfK, labels_raw, collect(0:9))
end

# Minimal dataset wrapper that implements LearnBase's `nobs`/`getobs` so the
# MLDataPattern tooling can consume it (the original note expects MLDatasets
# to provide this itself in the long term).
struct MNISTDataset{T, S}
    imgs::T      # image stack; observations are indexed along the third axis
    labels::S    # labels, indexed per observation
end

# Default constructor: wrap the Float32 MNIST training data.
MNISTDataset() = MNISTDataset(MNIST.traindata(Float32)...)

# The observation count is the extent of the image stack's third dimension.
function LearnBase.nobs(d::MNISTDataset)
    return size(d.imgs, 3)
end

# Fetch observation(s) `idx`: the image slice gets a new axis at position 3 and
# the one-hot encoded labels are passed through `Flux.squeezebatch`.
function LearnBase.getobs(d::MNISTDataset, idx)
    img_slice = d.imgs[:, :, idx]
    label_slice = d.labels[idx]
    features = Flux.unsqueeze(img_slice, 3)
    targets = Flux.squeezebatch(onehot(array_wrap(label_slice)))
    return features, targets
end

# Build the train/validation loaders over `MNISTDataset`. Per the original
# note these are lazy: data is only read when `getobs` is called, which could
# later be extended to loading from disk.
function loadmnist(batchsize = bs, train_split = 0.9)
    dataset = MNISTDataset()
    # Split the observations at the `train_split` fraction.
    splits = MLDataPattern.splitobs(dataset; at = train_split)
    traindata, valdata = splits

    # DataLoaders.DataLoader is used here; only the training part is shuffled.
    train_loader = DataLoader(shuffleobs(traindata), batchsize)
    val_loader = DataLoader(valdata, batchsize)
    return (train_loader, val_loader)
end

# Main: batch size, split fraction and the two loaders.
const bs = 128
const train_split = 0.9
train_dataloader, test_dataloader = loadmnist(bs, train_split);

# Downsampling stack: 1 -> 64 channels, then two stride-2 4x4 convolutions.
down = gpu(Chain(
    Conv((3, 3), 1 => 64, relu; stride = 1),
    GroupNorm(64, 64),
    Conv((4, 4), 64 => 64, relu; stride = 2, pad = 1),
    GroupNorm(64, 64),
    Conv((4, 4), 64 => 64; stride = 2, pad = 1),
));

# Network used as the ODE right-hand side: two padded 3x3 convolutions.
dudt = gpu(Chain(
    Conv((3, 3), 64 => 64, relu; stride = 1, pad = 1),
    Conv((3, 3), 64 => 64, relu; stride = 1, pad = 1),
));

# Classification head: GroupNorm, ReLU, 6x6 mean-pool, flatten, Dense to 10.
fc = gpu(Chain(
    GroupNorm(64, 64),
    x -> relu.(x),
    MeanPool((6, 6)),
    flatten,
    Dense(64, 10),
));

# Neural ODE on the time span (0, 1), solved with Tsit5, with `save_everystep`
# and `save_start` both switched off.
nn_ode = gpu(NeuralODE(dudt, (0.f0, 1.f0), Tsit5();
                       save_everystep = false,
                       reltol = 1e-3, abstol = 1e-3,
                       save_start = false));


# Move the ODE solution to the GPU and pass it through `Flux.squeezebatch`.
function diffeqsol2arr(x)
    return Flux.squeezebatch(gpu(x))
end

# Overall model topology. Per-stage shapes as annotated by the original author:
#   down   : (28,28,1,BS) -> (6,6,64,BS)
#   nn_ode : (6,6,64,BS)  -> (6,6,64,BS)
#   fc     : (6,6,64,BS)  -> (10, BS)
model = Chain(down, nn_ode, diffeqsol2arr, fc)

# Take the first batch from the training loader and move both parts to the GPU.
img, lab = gpu.(first(train_dataloader));

# Output of the downsampler alone, to inspect what the NN-ODE layer receives.
x_d = down(img)

# Forward pass through the complete topology, NN-ODE layer included.
x_m = model(img)

# For every column of `x`, return the row index of that column's largest entry.
function classify(x)
    return argmax.(eachcol(x))
end

# Fraction of correctly classified observations over at most `n_batches`
# batches of `data`, where `data` yields `(x, y)` pairs. Targets and model
# outputs are copied to the CPU and reduced to class indices with `classify`.
function accuracy(model, data; n_batches = 100)
    n_hits = 0
    n_seen = 0
    for (batch_no, (inputs, targets)) in enumerate(data)
        # Stop once the requested number of batches has been scored.
        if batch_no > n_batches
            break
        end
        truth = classify(cpu(targets))
        guess = classify(cpu(model(inputs)))
        n_hits += count(truth .== guess)
        n_seen += length(truth)
    end
    return n_hits / n_seen
end

# Training objective: logit cross-entropy between the model output and the labels.
function loss(x, y)
    return logitcrossentropy(model(x), y)
end

# Warm-up call of the loss. Per the original note this must precede the
# accuracy warm-up, because iterating `train_dataloader` clears the buffers
# behind `img` and `lab`.
loss(img, lab)

# Warm-up call of the accuracy routine on the training loader.
accuracy(model, CuIterator(train_dataloader))

# Optimiser and the global iteration counter incremented by `cb`.
opt = ADAM(1e-3)
iter = 0

# Training callback: increment the global `iter` counter and, on iterations
# 1, 11, 21, ..., print the training accuracy (default batch cap) and the
# test accuracy.
function cb()
    global iter += 1
    # Only iterations with remainder 1 modulo 10 produce a report.
    iter % 10 == 1 || return nothing

    train_accuracy = 100 * accuracy(model, CuIterator(train_dataloader))
    # The cap passed here is the observation count of the loader's data.
    batch_cap = nobs(test_dataloader.data)
    test_accuracy = 100 * accuracy(model, CuIterator(test_dataloader);
                                   n_batches = batch_cap)
    @printf("Iter: %3d || Train Accuracy: %2.3f || Test Accuracy: %2.3f\n",
            iter, train_accuracy, test_accuracy)
    return nothing
end

# Timed training run of the NN-ODE model over the training loader. The
# trainable parameters are those of `down`, the ODE layer (`nn_ode.p`) and
# `fc`; `cb` is the monitoring callback.
@time Flux.train!(
    loss,
    Flux.params(down, nn_ode.p, fc),
    CuIterator(train_dataloader),
    opt;
    cb = cb,
)

## snippet-5.jl
using DiffEqFlux, OrdinaryDiffEq, Flux, Printf
using Flux.Losses: logitcrossentropy
using Flux.Data: DataLoader
using MLDatasets
using MLDataUtils:  LabelEnc, convertlabel, stratifiedobs
using CUDA
CUDA.allowscalar(false)

# Build the MNIST train/test `DataLoader` pair with the data moved through
# `gpu` up front.
#
# `batchsize` is forwarded to both loaders and `train_split` is handed to
# `stratifiedobs` as `p`. Returns `(train_loader, test_loader)`; only the
# training loader shuffles.
function loadmnist(batchsize = bs, train_split = 0.9)
    raw_imgs, raw_labels = MNIST.traindata()

    # Insert a singleton third axis, (H, W, N) -> (H, W, 1, N), stored as Float32.
    height, width, nimgs = size(raw_imgs, 1), size(raw_imgs, 2), size(raw_imgs, 3)
    features = Float32.(reshape(raw_imgs, height, width, 1, nimgs))

    # One-of-K encode the labels through MLDataUtils, with 0:9 as the native labels.
    native = LabelEnc.NativeLabels(collect(0:9))
    targets = convertlabel(LabelEnc.OneOfK, raw_labels, native)

    train_part, test_part = stratifiedobs((features, targets), p = train_split)
    x_train, y_train = train_part
    x_test, y_test = test_part

    # Each part is collected and handed to `gpu` before the loader is built.
    train_loader = DataLoader((gpu(collect(x_train)), gpu(collect(y_train)));
                              batchsize = batchsize, shuffle = true)
    test_loader = DataLoader((gpu(collect(x_test)), gpu(collect(y_test)));
                             batchsize = batchsize, shuffle = false)
    return (train_loader, test_loader)
end
# Mini-batch size and the fraction handed to `loadmnist` as `train_split`.
const bs = 128
const train_split = 0.9

train_dataloader, test_dataloader = loadmnist(bs, train_split);

# Downsampling stack: 1 -> 64 channels, then two stride-2 4x4 convolutions.
down = gpu(Chain(
    Conv((3, 3), 1 => 64, relu),
    GroupNorm(64, 64),
    Conv((4, 4), 64 => 64, relu; stride = (2, 2), pad = (1, 1)),
    GroupNorm(64, 64),
    Conv((4, 4), 64 => 64; stride = (2, 2), pad = (1, 1)),
))

# Network used as the ODE right-hand side: two padded 3x3 convolutions.
dudt = gpu(Chain(
    Conv((3, 3), 64 => 64, relu; pad = (1, 1)),
    Conv((3, 3), 64 => 64, relu; pad = (1, 1)),
))

# Classification head: GroupNorm, ReLU, 6x6 mean-pool, reshape to 64 rows, Dense.
fc = gpu(Chain(
    GroupNorm(64, 64),
    x -> relu.(x),
    MeanPool((6, 6)),
    x -> reshape(x, 64, :),
    Dense(64, 10),
))

# Neural ODE on the time span (0, 1), solved with Tsit5, with `save_everystep`
# and `save_start` both switched off.
nn_ode = gpu(NeuralODE(dudt, (0.f0, 1.f0), Tsit5();
                       save_everystep = false,
                       reltol = 1e-3, abstol = 1e-3,
                       save_start = false))

# Move the ODE-layer output to the GPU and keep index 1 along its fifth axis.
function DiffEqArray_to_Array(x)
    return gpu(x)[:, :, :, :, 1]
end

# End-to-end classifier. Per-stage shapes as annotated by the original author:
#   down   : (28,28,1,BS) -> (6,6,64,BS)
#   nn_ode : (6,6,64,BS)  -> (6,6,64,BS)
#   fc     : (6,6,64,BS)  -> (10, BS)
model = Chain(
    down,
    nn_ode,
    DiffEqArray_to_Array,
    x -> reshape(x, 6, 6, 64, :),
    fc,
)

# Single-observation batch: the first image and its label column, sliced
# straight out of the loader's stored data.
img = train_dataloader.data[1][:, :, :, 1:1]
lab = train_dataloader.data[2][:, 1:1]

# Run the downsampler alone, then the whole model, on that sample.
x_d = down(img)
x_m = model(img)

# For every column of `x`, return the row index of that column's largest entry.
function classify(x)
    return argmax.(eachcol(x))
end

# Fraction of correctly classified observations over at most `n_batches`
# batches of `data`, where `data` yields `(x, y)` pairs. Targets and model
# outputs are copied to the CPU and reduced to class indices with `classify`.
function accuracy(model, data; n_batches = 100)
    n_hits = 0
    n_seen = 0
    for (batch_no, (inputs, targets)) in enumerate(data)
        # Stop once the requested number of batches has been scored.
        if batch_no > n_batches
            break
        end
        truth = classify(cpu(targets))
        guess = classify(cpu(model(inputs)))
        n_hits += count(truth .== guess)
        n_seen += length(truth)
    end
    return n_hits / n_seen
end

# Training objective: logit cross-entropy between the model output and the labels.
function loss(x, y)
    return logitcrossentropy(model(x), y)
end

# Warm-up call of the loss on the single-sample batch.
loss(img, lab)

# Warm-up call of the accuracy routine, iterating the training loader directly.
accuracy(model, train_dataloader)

# Optimiser and the global iteration counter incremented by `cb`.
opt = ADAM(0.001)
iter = 0

# Training callback: increment the global `iter` counter and, on iterations
# 1, 11, 21, ..., print the accuracy measured over all test batches.
function cb()
    global iter += 1
    # Only iterations with remainder 1 modulo 10 produce a report.
    iter % 10 == 1 || return nothing

    n_test_batches = length(test_dataloader)
    test_accuracy = 100 * accuracy(model, test_dataloader;
                                   n_batches = n_test_batches)
    @printf("Iter: %3d || Test Accuracy: %2.3f\n", iter, test_accuracy)
    return nothing
end

# Timed training run that iterates the training loader directly. The trainable
# parameters are those of `down`, the ODE layer (`nn_ode.p`) and `fc`; `cb` is
# the callback.
@time Flux.train!(
    loss,
    Flux.params(down, nn_ode.p, fc),
    train_dataloader,
    opt;
    cb = cb,
)
	using DiffEqFlux, OrdinaryDiffEq, Flux, Printf
	using Flux.Losses: logitcrossentropy
	using Flux.Data: DataLoader
	using MLDatasets
	using MLDataUtils: LabelEnc, convertlabel, stratifiedobs
	using CUDA
	CUDA.allowscalar(false)

	# Build the MNIST train/test `DataLoader` pair.
	#
	# `batchsize` is forwarded to both loaders and `train_split` is handed to
	# `stratifiedobs` as `p`. Returns `(train_loader, test_loader)`; only the
	# training loader shuffles.
	function loadmnist(batchsize = bs, train_split = 0.9)
	    raw_imgs, raw_labels = MNIST.traindata()

	    # Insert a singleton third axis, (H, W, N) -> (H, W, 1, N), stored as Float32.
	    height, width, nimgs = size(raw_imgs, 1), size(raw_imgs, 2), size(raw_imgs, 3)
	    features = Float32.(reshape(raw_imgs, height, width, 1, nimgs))

	    # One-of-K encode the labels through MLDataUtils, with 0:9 as the native labels.
	    native = LabelEnc.NativeLabels(collect(0:9))
	    targets = convertlabel(LabelEnc.OneOfK, raw_labels, native)

	    train_part, test_part = stratifiedobs((features, targets), p = train_split)
	    x_train, y_train = train_part
	    x_test, y_test = test_part

	    # Flux's DataLoader does the mini-batching; labels are cast to Float32 here.
	    train_loader = DataLoader((x_train, Float32.(y_train));
	                              batchsize = batchsize, shuffle = true)
	    test_loader = DataLoader((x_test, Float32.(y_test));
	                             batchsize = batchsize, shuffle = false)
	    return (train_loader, test_loader)
	end

	# Mini-batch size and the fraction handed to `loadmnist` as `train_split`.
	const bs = 128
	const train_split = 0.9

	train_dataloader, test_dataloader = loadmnist(bs, train_split);

	# NOTE: every layer below is piped to the GPU with `|> gpu`. The pipe was
	# previously written `\|>` (a Markdown table escape), which Julia cannot parse.

	# Downsampling stack: 1 -> 64 channels, then two stride-2 4x4 convolutions.
	down = Chain(
	    Conv((3, 3), 1 => 64, relu),
	    GroupNorm(64, 64),
	    Conv((4, 4), 64 => 64, relu; stride = (2, 2), pad = (1, 1)),
	    GroupNorm(64, 64),
	    Conv((4, 4), 64 => 64; stride = (2, 2), pad = (1, 1)),
	) |> gpu

	# Network used as the ODE right-hand side: two padded 3x3 convolutions.
	dudt = Chain(
	    Conv((3, 3), 64 => 64, relu; pad = (1, 1)),
	    Conv((3, 3), 64 => 64, relu; pad = (1, 1)),
	) |> gpu

	# Classification head: GroupNorm, ReLU, 6x6 mean-pool, reshape to 64 rows, Dense.
	fc = Chain(
	    GroupNorm(64, 64),
	    x -> relu.(x),
	    MeanPool((6, 6)),
	    x -> reshape(x, 64, :),
	    Dense(64, 10),
	) |> gpu

	# Neural ODE on the time span (0, 1), solved with Tsit5, with `save_everystep`
	# and `save_start` both switched off.
	nn_ode = NeuralODE(dudt, (0.f0, 1.f0), Tsit5();
	                   save_everystep = false,
	                   reltol = 1e-3, abstol = 1e-3,
	                   save_start = false) |> gpu

	# Move the ODE-layer output to the GPU and keep index 1 along its fifth axis.
	function DiffEqArray_to_Array(x)
	    return gpu(x)[:, :, :, :, 1]
	end

	# End-to-end classifier. Per-stage shapes as annotated by the original author:
	#   down   : (28,28,1,BS) -> (6,6,64,BS)
	#   nn_ode : (6,6,64,BS)  -> (6,6,64,BS)
	#   fc     : (6,6,64,BS)  -> (10, BS)
	model = Chain(
	    down,
	    nn_ode,
	    DiffEqArray_to_Array,
	    x -> reshape(x, 6, 6, 64, :),
	    fc,
	)

	# Single-observation batch (first image and its label column) moved to the GPU.
	img = gpu(train_dataloader.data[1][:, :, :, 1:1])
	lab = gpu(train_dataloader.data[2][:, 1:1])

	# Run the downsampler alone, then the whole model, on that sample.
	x_d = down(img)
	x_m = model(img)

	# For every column of `x`, return the row index of that column's largest entry.
	function classify(x)
	    return argmax.(eachcol(x))
	end

	# Fraction of correctly classified observations over at most `n_batches`
	# batches of `data`, where `data` yields `(x, y)` pairs. Targets and model
	# outputs are copied to the CPU and reduced to class indices with `classify`.
	function accuracy(model, data; n_batches = 100)
	    n_hits = 0
	    n_seen = 0
	    for (batch_no, (inputs, targets)) in enumerate(data)
	        # Stop once the requested number of batches has been scored.
	        if batch_no > n_batches
	            break
	        end
	        truth = classify(cpu(targets))
	        guess = classify(cpu(model(inputs)))
	        n_hits += count(truth .== guess)
	        n_seen += length(truth)
	    end
	    return n_hits / n_seen
	end

	# Training objective: logit cross-entropy between the model output and the labels.
	function loss(x, y)
	    return logitcrossentropy(model(x), y)
	end

	# Warm-up call of the loss on the single-sample batch.
	loss(img, lab)

	# Warm-up call of the accuracy routine on the training loader.
	accuracy(model, CuIterator(train_dataloader))

	# Optimiser and the global iteration counter incremented by `cb`.
	opt = ADAM(0.001)
	iter = 0

	# Training callback: increment the global `iter` counter and, on iterations
	# 1, 11, 21, ..., print the accuracy over all test batches, then run the
	# garbage collector and `CUDA.reclaim()`.
	function cb()
	    global iter += 1
	    # Only iterations with remainder 1 modulo 10 produce a report.
	    iter % 10 == 1 || return nothing

	    n_test_batches = length(test_dataloader)
	    test_accuracy = 100 * accuracy(model, CuIterator(test_dataloader);
	                                   n_batches = n_test_batches)
	    # The separator is a plain `||`; the former `\|\|` spelling is an invalid
	    # escape sequence inside a Julia string literal.
	    @printf("Iter: %3d || Test Accuracy: %2.3f\n", iter, test_accuracy)

	    GC.gc()
	    return CUDA.reclaim()
	end

	# Timed training run over the training loader. The trainable parameters are
	# those of `down`, the ODE layer (`nn_ode.p`) and `fc`; `cb` is the callback.
	@time Flux.train!(
	    loss,
	    Flux.params(down, nn_ode.p, fc),
	    CuIterator(train_dataloader),
	    opt;
	    cb = cb,
	)
	using DiffEqFlux, OrdinaryDiffEq, Flux, NNlib, Printf
	using Flux: logitcrossentropy
	using DataLoaders
	using MLDatasets
	using MLDataPattern
	using LearnBase
	using MLDataUtils: LabelEnc, convertlabel
	using CUDA
	CUDA.allowscalar(false)

	# Label helpers. `array_wrap` passes arrays through unchanged and wraps a lone
	# number in a one-element vector; `onehot` one-of-K encodes labels through
	# MLDataUtils using 0:9 as the label set.
	function array_wrap(x::AbstractArray)
	    return x
	end

	function array_wrap(x::Number)
	    return [x]
	end

	function onehot(labels_raw)
	    return convertlabel(LabelEnc.OneOfK, labels_raw, collect(0:9))
	end

	# Minimal dataset wrapper that implements LearnBase's `nobs`/`getobs` so the
	# MLDataPattern tooling can consume it (the original note expects MLDatasets
	# to provide this itself in the long term).
	struct MNISTDataset{T, S}
	    imgs::T      # image stack; observations are indexed along the third axis
	    labels::S    # labels, indexed per observation
	end

	# Default constructor: wrap the Float32 MNIST training data.
	MNISTDataset() = MNISTDataset(MNIST.traindata(Float32)...)

	# The observation count is the extent of the image stack's third dimension.
	function LearnBase.nobs(d::MNISTDataset)
	    return size(d.imgs, 3)
	end

	# Fetch observation(s) `idx`: the image slice gets a new axis at position 3 and
	# the one-hot encoded labels are passed through `Flux.squeezebatch`.
	function LearnBase.getobs(d::MNISTDataset, idx)
	    img_slice = d.imgs[:, :, idx]
	    label_slice = d.labels[idx]
	    features = Flux.unsqueeze(img_slice, 3)
	    targets = Flux.squeezebatch(onehot(array_wrap(label_slice)))
	    return features, targets
	end

	# Build the train/validation loaders over `MNISTDataset`. Per the original
	# note these are lazy: data is only read when `getobs` is called, which could
	# later be extended to loading from disk.
	function loadmnist(batchsize = bs, train_split = 0.9)
	    dataset = MNISTDataset()
	    # Split the observations at the `train_split` fraction.
	    splits = MLDataPattern.splitobs(dataset; at = train_split)
	    traindata, valdata = splits

	    # DataLoaders.DataLoader is used here; only the training part is shuffled.
	    train_loader = DataLoader(shuffleobs(traindata), batchsize)
	    val_loader = DataLoader(valdata, batchsize)
	    return (train_loader, val_loader)
	end

	# Main: batch size, split fraction and the two loaders.
	const bs = 128
	const train_split = 0.9
	train_dataloader, test_dataloader = loadmnist(bs, train_split);

	# NOTE: every layer below is piped to the GPU with `|> gpu`. The pipe was
	# previously written `\|>` (a Markdown table escape), which Julia cannot parse.

	# Downsampling stack: 1 -> 64 channels, then two stride-2 4x4 convolutions.
	down = Chain(
	    Conv((3, 3), 1 => 64, relu; stride = 1),
	    GroupNorm(64, 64),
	    Conv((4, 4), 64 => 64, relu; stride = 2, pad = 1),
	    GroupNorm(64, 64),
	    Conv((4, 4), 64 => 64; stride = 2, pad = 1),
	) |> gpu;

	# Network used as the ODE right-hand side: two padded 3x3 convolutions.
	dudt = Chain(
	    Conv((3, 3), 64 => 64, relu; stride = 1, pad = 1),
	    Conv((3, 3), 64 => 64, relu; stride = 1, pad = 1),
	) |> gpu;

	# Classification head: GroupNorm, ReLU, 6x6 mean-pool, flatten, Dense to 10.
	fc = Chain(
	    GroupNorm(64, 64),
	    x -> relu.(x),
	    MeanPool((6, 6)),
	    flatten,
	    Dense(64, 10),
	) |> gpu;

	# Neural ODE on the time span (0, 1), solved with Tsit5, with `save_everystep`
	# and `save_start` both switched off.
	nn_ode = NeuralODE(dudt, (0.f0, 1.f0), Tsit5();
	                   save_everystep = false,
	                   reltol = 1e-3, abstol = 1e-3,
	                   save_start = false) |> gpu;


	# Move the ODE solution to the GPU and pass it through `Flux.squeezebatch`.
	function diffeqsol2arr(x)
	    return Flux.squeezebatch(gpu(x))
	end

	# Overall model topology. Per-stage shapes as annotated by the original author:
	#   down   : (28,28,1,BS) -> (6,6,64,BS)
	#   nn_ode : (6,6,64,BS)  -> (6,6,64,BS)
	#   fc     : (6,6,64,BS)  -> (10, BS)
	model = Chain(down, nn_ode, diffeqsol2arr, fc)

	# Take the first batch from the training loader and move both parts to the GPU.
	# The broadcast pipe is spelled `.|>`; the former `.\|>` (Markdown escape) does
	# not parse.
	img, lab = first(train_dataloader) .|> gpu;

	# Output of the downsampler alone, to inspect what the NN-ODE layer receives.
	x_d = down(img)

	# Forward pass through the complete topology, NN-ODE layer included.
	x_m = model(img)

	# For every column of `x`, return the row index of that column's largest entry.
	function classify(x)
	    return argmax.(eachcol(x))
	end

	# Fraction of correctly classified observations over at most `n_batches`
	# batches of `data`, where `data` yields `(x, y)` pairs. Targets and model
	# outputs are copied to the CPU and reduced to class indices with `classify`.
	function accuracy(model, data; n_batches = 100)
	    n_hits = 0
	    n_seen = 0
	    for (batch_no, (inputs, targets)) in enumerate(data)
	        # Stop once the requested number of batches has been scored.
	        if batch_no > n_batches
	            break
	        end
	        truth = classify(cpu(targets))
	        guess = classify(cpu(model(inputs)))
	        n_hits += count(truth .== guess)
	        n_seen += length(truth)
	    end
	    return n_hits / n_seen
	end

	# Training objective: logit cross-entropy between the model output and the labels.
	function loss(x, y)
	    return logitcrossentropy(model(x), y)
	end

	# Warm-up call of the loss. Per the original note this must precede the
	# accuracy warm-up, because iterating `train_dataloader` clears the buffers
	# behind `img` and `lab`.
	loss(img, lab)

	# Warm-up call of the accuracy routine on the training loader.
	accuracy(model, CuIterator(train_dataloader))

	# Optimiser and the global iteration counter incremented by `cb`.
	opt = ADAM(1e-3)
	iter = 0

	# Training callback: increment the global `iter` counter and, on iterations
	# 1, 11, 21, ..., print the training accuracy (default batch cap) and the
	# test accuracy.
	function cb()
	    global iter += 1
	    # Only iterations with remainder 1 modulo 10 produce a report.
	    iter % 10 == 1 || return nothing

	    train_accuracy = 100 * accuracy(model, CuIterator(train_dataloader))
	    # The cap passed here is the observation count of the loader's data.
	    batch_cap = nobs(test_dataloader.data)
	    test_accuracy = 100 * accuracy(model, CuIterator(test_dataloader);
	                                   n_batches = batch_cap)
	    # The separators are plain `||`; the former `\|\|` spelling is an invalid
	    # escape sequence inside a Julia string literal.
	    @printf("Iter: %3d || Train Accuracy: %2.3f || Test Accuracy: %2.3f\n",
	            iter, train_accuracy, test_accuracy)
	    return nothing
	end

	# Timed training run of the NN-ODE model over the training loader. The
	# trainable parameters are those of `down`, the ODE layer (`nn_ode.p`) and
	# `fc`; `cb` is the monitoring callback.
	@time Flux.train!(
	    loss,
	    Flux.params(down, nn_ode.p, fc),
	    CuIterator(train_dataloader),
	    opt;
	    cb = cb,
	)