Skip to content

Instantly share code, notes, and snippets.

@darsnack
Created November 26, 2020 23:55
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save darsnack/28289e94e267b62ca04bb990567ae37a to your computer and use it in GitHub Desktop.
Save darsnack/28289e94e267b62ca04bb990567ae37a to your computer and use it in GitHub Desktop.
GPU Allocation Snippets
using DiffEqFlux, OrdinaryDiffEq, Flux, Printf
using Flux.Losses: logitcrossentropy
using Flux.Data: DataLoader
using MLDatasets
using MLDataUtils: LabelEnc, convertlabel, stratifiedobs
using CUDA
CUDA.allowscalar(false)
# Build the (train, test) pair of Flux `DataLoader`s from `MNIST.traindata()`.
#
# `batchsize` is forwarded to both loaders and `train_split` is passed as `p` to
# `stratifiedobs`. Only the first (training) loader shuffles.
function loadmnist(batchsize = bs, train_split = 0.9)
    imgs, labels_raw = MNIST.traindata()
    # Insert a singleton channel axis so images form (H,W,C,BS) batches
    height, width, n_imgs = size(imgs, 1), size(imgs, 2), size(imgs, 3)
    x_data = Float32.(reshape(imgs, height, width, 1, n_imgs))
    # One-of-K (one-hot) label encoding over the native labels 0:9
    digit_classes = LabelEnc.NativeLabels(collect(0:9))
    y_data = convertlabel(LabelEnc.OneOfK, labels_raw, digit_classes)
    train_part, test_part = stratifiedobs((x_data, y_data), p = train_split)
    x_train, y_train = train_part
    x_test, y_test = test_part
    # Flux's DataLoader minibatches (and, for training, shuffles) the data
    train_loader = DataLoader((x_train, Float32.(y_train));
                              batchsize = batchsize, shuffle = true)
    # Don't shuffle the test data
    test_loader = DataLoader((x_test, Float32.(y_test));
                             batchsize = batchsize, shuffle = false)
    return (train_loader, test_loader)
end
# Batch size and training fraction handed to `loadmnist`
const bs = 128
const train_split = 0.9
train_dataloader, test_dataloader = loadmnist(bs, train_split);

# Convolutional stack applied before the ODE layer
down = gpu(Chain(
    Conv((3, 3), 1 => 64, relu),
    GroupNorm(64, 64),
    Conv((4, 4), 64 => 64, relu; stride = (2, 2), pad = (1, 1)),
    GroupNorm(64, 64),
    Conv((4, 4), 64 => 64; stride = (2, 2), pad = (1, 1)),
))

# Network wrapped by `NeuralODE` below
dudt = gpu(Chain(
    Conv((3, 3), 64 => 64, relu; pad = (1, 1)),
    Conv((3, 3), 64 => 64, relu; pad = (1, 1)),
))

# Classification head ending in a 64 => 10 dense layer
fc = gpu(Chain(
    GroupNorm(64, 64),
    x -> relu.(x),
    # fc = Chain( # x->relu.(x),
    MeanPool((6, 6)),
    x -> reshape(x, 64, :),
    Dense(64, 10),
))

# Neural ODE over tspan (0, 1) solved with Tsit5
nn_ode = gpu(NeuralODE(dudt, (0.0f0, 1.0f0), Tsit5();
                       save_everystep = false,
                       reltol = 1e-3, abstol = 1e-3,
                       save_start = false))
# Pass `x` through `gpu`, then keep only index 1 along the fifth axis.
DiffEqArray_to_Array(x) = gpu(x)[:, :, :, :, 1]
# End-to-end classifier
model = Chain(
    down,                           # (28,28,1,BS) -> (6,6,64,BS)
    nn_ode,                         # (6,6,64,BS) -> (6,6,64,BS)
    DiffEqArray_to_Array,
    x -> reshape(x, 6, 6, 64, :),
    fc,                             # (6,6,64,BS) -> (10, BS)
)

# First stored observation, kept as a batch of one and passed through `gpu`
img = gpu(train_dataloader.data[1][:, :, :, 1:1])
lab = gpu(train_dataloader.data[2][:, 1:1])
# Forward pass through the downsampling stack alone, then the whole model
x_d = down(img)
x_m = model(img)
# Index of the largest entry in each column of `x`, one result per column.
classify(x) = map(argmax, eachcol(x))
# Fraction of observations in `data` for which `classify` agrees on the target
# `y` and on `model(x)`. `data` yields `(x, y)` batches; both sides go through
# `cpu` before classification. At most `n_batches` batches are scored.
function accuracy(model, data; n_batches = 100)
    n_correct = 0
    n_seen = 0
    for (batch_idx, (x, y)) in enumerate(data)
        # Only evaluate accuracy for n_batches
        if batch_idx > n_batches
            break
        end
        truth = classify(cpu(y))
        guess = classify(cpu(model(x)))
        n_correct += count(truth .== guess)
        n_seen += length(truth)
    end
    return n_correct / n_seen
end
# Training objective: logit cross-entropy between the model output and `y`
function loss(x, y)
    logits = model(x)
    return logitcrossentropy(logits, y)
end
# loss(x, y) = Flux.mse(model(x), y)

# burn in loss
loss(img, lab)
# burn in accuracy
accuracy(model, CuIterator(train_dataloader))

opt = ADAM(1e-3)
# Callback invocation counter, incremented by `cb`
iter = 0
# Training callback: bump the global `iter` counter and, on iterations
# 1, 11, 21, ..., print the test accuracy, then run `GC.gc()` and `CUDA.reclaim()`.
function cb()
    global iter
    iter += 1
    # Monitor that the weights do in fact update
    # Every 10 training iterations show accuracy
    if iter % 10 == 1
        # train_accuracy = accuracy(model, CuIterator(train_dataloader)) * 100
        n_test_batches = length(test_dataloader)
        test_accuracy = 100 * accuracy(model, CuIterator(test_dataloader);
                                       n_batches = n_test_batches)
        @printf("Iter: %3d || Test Accuracy: %2.3f\n", iter, test_accuracy)
        GC.gc()
        CUDA.reclaim()
        # CUDA.memory_status()
    end
end
# Run `Flux.train!` over the CuIterator-wrapped training loader, updating the
# parameters of `down`, `nn_ode.p` and `fc`; `@time` reports the cost of the call.
@time Flux.train!(
    loss,
    Flux.params(down, nn_ode.p, fc),
    CuIterator(train_dataloader),
    opt;
    cb = cb,
)
using DiffEqFlux, OrdinaryDiffEq, Flux, Printf
using Flux.Losses: logitcrossentropy
using Flux.Data: DataLoader
using MLDatasets
using MLDataUtils: LabelEnc, convertlabel, stratifiedobs
using CUDA
CUDA.allowscalar(false)
# Build the (train, test) pair of Flux `DataLoader`s from `MNIST.traindata()`.
#
# `batchsize` is forwarded to both loaders and `train_split` is passed as `p` to
# `stratifiedobs`. Only the first (training) loader shuffles.
function loadmnist(batchsize = bs, train_split = 0.9)
    imgs, labels_raw = MNIST.traindata()
    # Insert a singleton channel axis so images form (H,W,C,BS) batches
    height, width, n_imgs = size(imgs, 1), size(imgs, 2), size(imgs, 3)
    x_data = Float32.(reshape(imgs, height, width, 1, n_imgs))
    # One-of-K (one-hot) label encoding over the native labels 0:9
    digit_classes = LabelEnc.NativeLabels(collect(0:9))
    y_data = convertlabel(LabelEnc.OneOfK, labels_raw, digit_classes)
    train_part, test_part = stratifiedobs((x_data, y_data), p = train_split)
    x_train, y_train = train_part
    x_test, y_test = test_part
    # Flux's DataLoader minibatches (and, for training, shuffles) the data
    train_loader = DataLoader((x_train, Float32.(y_train));
                              batchsize = batchsize, shuffle = true)
    # Don't shuffle the test data
    test_loader = DataLoader((x_test, Float32.(y_test));
                             batchsize = batchsize, shuffle = false)
    return (train_loader, test_loader)
end
# Batch size and training fraction handed to `loadmnist`
const bs = 128
const train_split = 0.9
train_dataloader, test_dataloader = loadmnist(bs, train_split);

# Convolutional stack applied before the ODE layer
down = gpu(Chain(
    Conv((3, 3), 1 => 64, relu),
    GroupNorm(64, 64),
    Conv((4, 4), 64 => 64, relu; stride = (2, 2), pad = (1, 1)),
    GroupNorm(64, 64),
    Conv((4, 4), 64 => 64; stride = (2, 2), pad = (1, 1)),
))

# Network wrapped by `NeuralODE` below
dudt = gpu(Chain(
    Conv((3, 3), 64 => 64, relu; pad = (1, 1)),
    Conv((3, 3), 64 => 64, relu; pad = (1, 1)),
))

# Classification head ending in a 64 => 10 dense layer
fc = gpu(Chain(
    GroupNorm(64, 64),
    x -> relu.(x),
    # fc = Chain( # x->relu.(x),
    MeanPool((6, 6)),
    x -> reshape(x, 64, :),
    Dense(64, 10),
))

# Neural ODE over tspan (0, 1) solved with Tsit5
nn_ode = gpu(NeuralODE(dudt, (0.0f0, 1.0f0), Tsit5();
                       save_everystep = false,
                       reltol = 1e-3, abstol = 1e-3,
                       save_start = false))
# Pass `x` through `gpu`, then keep only index 1 along the fifth axis.
DiffEqArray_to_Array(x) = gpu(x)[:, :, :, :, 1]
# End-to-end classifier
model = Chain(
    down,                           # (28,28,1,BS) -> (6,6,64,BS)
    nn_ode,                         # (6,6,64,BS) -> (6,6,64,BS)
    DiffEqArray_to_Array,
    x -> reshape(x, 6, 6, 64, :),
    fc,                             # (6,6,64,BS) -> (10, BS)
)

# First stored observation, kept as a batch of one and passed through `gpu`
img = gpu(train_dataloader.data[1][:, :, :, 1:1])
lab = gpu(train_dataloader.data[2][:, 1:1])
# Forward pass through the downsampling stack alone, then the whole model
x_d = down(img)
x_m = model(img)
# Index of the largest entry in each column of `x`, one result per column.
classify(x) = map(argmax, eachcol(x))
# Fraction of observations in `data` for which `classify` agrees on the target
# `y` and on `model(x)`. `data` yields `(x, y)` batches; both sides go through
# `cpu` before classification. At most `n_batches` batches are scored.
function accuracy(model, data; n_batches = 100)
    n_correct = 0
    n_seen = 0
    for (batch_idx, (x, y)) in enumerate(data)
        # Only evaluate accuracy for n_batches
        if batch_idx > n_batches
            break
        end
        truth = classify(cpu(y))
        guess = classify(cpu(model(x)))
        n_correct += count(truth .== guess)
        n_seen += length(truth)
    end
    return n_correct / n_seen
end
# Training objective: logit cross-entropy between the model output and `y`
function loss(x, y)
    logits = model(x)
    return logitcrossentropy(logits, y)
end
# loss(x, y) = Flux.mse(model(x), y)

# burn in loss
loss(img, lab)
# burn in accuracy
accuracy(model, CuIterator(train_dataloader))

opt = ADAM(1e-3)
# Callback invocation counter, incremented by `cb`
iter = 0
# Training callback: bump the global `iter` counter and, on iterations
# 1, 11, 21, ..., print the test accuracy. The explicit GC / reclaim calls
# are left disabled in this variant.
function cb()
    global iter
    iter += 1
    # Monitor that the weights do in fact update
    # Every 10 training iterations show accuracy
    if iter % 10 == 1
        # train_accuracy = accuracy(model, CuIterator(train_dataloader)) * 100
        n_test_batches = length(test_dataloader)
        test_accuracy = 100 * accuracy(model, CuIterator(test_dataloader);
                                       n_batches = n_test_batches)
        @printf("Iter: %3d || Test Accuracy: %2.3f\n", iter, test_accuracy)
        # GC.gc()
        # CUDA.reclaim()
        # CUDA.memory_status()
    end
end
# Run `Flux.train!` over the CuIterator-wrapped training loader, updating the
# parameters of `down`, `nn_ode.p` and `fc`; `@time` reports the cost of the call.
@time Flux.train!(
    loss,
    Flux.params(down, nn_ode.p, fc),
    CuIterator(train_dataloader),
    opt;
    cb = cb,
)
using DiffEqFlux, OrdinaryDiffEq, Flux, NNlib, Printf
using Flux: logitcrossentropy
using DataLoaders
using MLDatasets
using MLDataPattern
using LearnBase
using MLDataUtils: LabelEnc, convertlabel
using CUDA
CUDA.allowscalar(false)
# Use MLDataUtils LabelEnc for natural onehot conversion
# Normalise the argument to an array: arrays are returned unchanged, a lone
# number is wrapped in a one-element vector.
function array_wrap(x::AbstractArray)
    return x
end

function array_wrap(x::Number)
    return [x]
end
# Convert raw labels to a `LabelEnc.OneOfK` encoding over the classes 0:9.
function onehot(labels_raw)
    digit_classes = collect(0:9)
    return convertlabel(LabelEnc.OneOfK, labels_raw, digit_classes)
end
# Implement an MLDataPattern-compatible interface.
# Long-term, MLDatasets will be updated to do this automatically.
# Pairs an image container (`imgs`) with its label container (`labels`).
# Both field types are left parametric.
struct MNISTDataset{Imgs, Labels}
    imgs::Imgs
    labels::Labels
end

# Zero-argument constructor: fill both fields from `MNIST.traindata(Float32)`.
function MNISTDataset()
    return MNISTDataset(MNIST.traindata(Float32)...)
end
# Number of observations = extent of `imgs` along its third dimension.
function LearnBase.nobs(d::MNISTDataset)
    return size(d.imgs, 3)
end
# Fetch the observation(s) at `idx` as an `(image, label)` pair.
# The image slice gets an extra axis inserted at position 3 via `Flux.unsqueeze`;
# the label is wrapped into an array, one-hot encoded and passed through
# `Flux.squeezebatch`.
function LearnBase.getobs(d::MNISTDataset, idx)
    img_slice = d.imgs[:, :, idx]
    raw_label = d.labels[idx]
    encoded = onehot(array_wrap(raw_label))
    return Flux.unsqueeze(img_slice, 3), Flux.squeezebatch(encoded)
end
# loadmnist is now returning lazy loaders
# the data is only read when the getobs call occurs
# this could be extended to only loading data from disk
# Build the (train, validation) pair of `DataLoader`s over an `MNISTDataset`.
#
# `train_split` is passed as `at` to `MLDataPattern.splitobs`; `batchsize` is
# forwarded to both loaders. Only the training part goes through `shuffleobs`.
function loadmnist(batchsize = bs, train_split = 0.9)
    # `nobs` / `getobs` defined for MNISTDataset are what the loaders rely on
    dataset = MNISTDataset()
    traindata, valdata = MLDataPattern.splitobs(dataset; at = train_split)
    # Use DataLoaders.DataLoader instead
    train_loader = DataLoader(shuffleobs(traindata), batchsize)
    # Don't shuffle the test data
    val_loader = DataLoader(valdata, batchsize)
    return (train_loader, val_loader)
end
# Main
# Batch size and training fraction handed to `loadmnist`
const bs = 128
const train_split = 0.9
train_dataloader, test_dataloader = loadmnist(bs, train_split);

# Convolutional stack applied before the ODE layer
down = gpu(Chain(
    Conv((3, 3), 1 => 64, relu; stride = 1),
    GroupNorm(64, 64),
    Conv((4, 4), 64 => 64, relu; stride = 2, pad = 1),
    GroupNorm(64, 64),
    Conv((4, 4), 64 => 64; stride = 2, pad = 1),
));

# Network wrapped by `NeuralODE` below
dudt = gpu(Chain(
    Conv((3, 3), 64 => 64, relu; stride = 1, pad = 1),
    Conv((3, 3), 64 => 64, relu; stride = 1, pad = 1),
));

# Classification head ending in a 64 => 10 dense layer
fc = gpu(Chain(
    GroupNorm(64, 64),
    x -> relu.(x),
    MeanPool((6, 6)),
    flatten,
    Dense(64, 10),
));

# Neural ODE over tspan (0, 1) solved with Tsit5
nn_ode = gpu(NeuralODE(dudt, (0.0f0, 1.0f0), Tsit5();
                       save_everystep = false,
                       reltol = 1e-3, abstol = 1e-3,
                       save_start = false));
# Pass `x` through `gpu`, then through `Flux.squeezebatch`.
function diffeqsol2arr(x)
    on_device = gpu(x)
    return Flux.squeezebatch(on_device)
end
# Build our overall model topology
model = Chain(
    down,            # (28,28,1,BS) -> (6,6,64,BS)
    nn_ode,          # (6,6,64,BS) -> (6,6,64,BS)
    diffeqsol2arr,
    fc,              # (6,6,64,BS) -> (10, BS)
)
# To understand the intermediate NN-ODE layer, we can examine its dimensionality
img, lab = gpu.(first(train_dataloader));
x_d = down(img)
# The forward pass through the full topology, NN-ODE layer included, can be
# computed as well.
x_m = model(img)
# Index of the largest entry in each column of `x`, one result per column.
classify(x) = map(argmax, eachcol(x))
# Fraction of observations in `data` for which `classify` agrees on the target
# `y` and on `model(x)`. `data` yields `(x, y)` batches; both sides go through
# `cpu` before classification. At most `n_batches` batches are scored.
function accuracy(model, data; n_batches = 100)
    n_correct = 0
    n_seen = 0
    for (batch_idx, (x, y)) in enumerate(data)
        # Only evaluate accuracy for n_batches
        if batch_idx > n_batches
            break
        end
        truth = classify(cpu(y))
        guess = classify(cpu(model(x)))
        n_correct += count(truth .== guess)
        n_seen += length(truth)
    end
    return n_correct / n_seen
end
# Training objective: logit cross-entropy between the model output and `y`
function loss(x, y)
    logits = model(x)
    return logitcrossentropy(logits, y)
end

# burn in loss
# do this before accuracy
# iterating train_dataloader will clear the buffers of img, lab
loss(img, lab)
# burn in accuracy
accuracy(model, CuIterator(train_dataloader))

opt = ADAM(0.001)
# Callback invocation counter, incremented by `cb`
iter = 0
# Training callback: bump the global `iter` counter and, on iterations
# 1, 11, 21, ..., print train and test accuracy. The explicit GC / reclaim
# calls are left disabled in this variant.
function cb()
    global iter
    iter += 1
    # Monitor that the weights do in fact update
    # Every 10 training iterations show accuracy
    if iter % 10 == 1
        train_accuracy = 100 * accuracy(model, CuIterator(train_dataloader))
        n_test_batches = nobs(test_dataloader.data)
        test_accuracy = 100 * accuracy(model, CuIterator(test_dataloader);
                                       n_batches = n_test_batches)
        @printf("Iter: %3d || Train Accuracy: %2.3f || Test Accuracy: %2.3f\n",
                iter, train_accuracy, test_accuracy)
        # GC.gc()
        # CUDA.reclaim()
        # CUDA.memory_status()
    end
end
# Train the NN-ODE; `cb` reports accuracy along the way and `@time` reports
# the cost of the whole call.
@time Flux.train!(
    loss,
    Flux.params(down, nn_ode.p, fc),
    CuIterator(train_dataloader),
    opt;
    cb = cb,
)
using DiffEqFlux, OrdinaryDiffEq, Flux, Printf
using Flux.Losses: logitcrossentropy
using Flux.Data: DataLoader
using MLDatasets
using MLDataUtils: LabelEnc, convertlabel, stratifiedobs
using CUDA
CUDA.allowscalar(false)
# Build the (train, test) pair of Flux `DataLoader`s from `MNIST.traindata()`.
#
# In this variant each split is collected and passed through `gpu` before it is
# handed to the loader. `batchsize` is forwarded to both loaders and
# `train_split` is passed as `p` to `stratifiedobs`. Only the first (training)
# loader shuffles.
function loadmnist(batchsize = bs, train_split = 0.9)
    imgs, labels_raw = MNIST.traindata()
    # Insert a singleton channel axis so images form (H,W,C,BS) batches
    height, width, n_imgs = size(imgs, 1), size(imgs, 2), size(imgs, 3)
    x_data = Float32.(reshape(imgs, height, width, 1, n_imgs))
    # One-of-K (one-hot) label encoding over the native labels 0:9
    digit_classes = LabelEnc.NativeLabels(collect(0:9))
    y_data = convertlabel(LabelEnc.OneOfK, labels_raw, digit_classes)
    train_part, test_part = stratifiedobs((x_data, y_data), p = train_split)
    x_train, y_train = train_part
    x_test, y_test = test_part
    # Flux's DataLoader minibatches (and, for training, shuffles) the data
    train_loader = DataLoader((gpu(collect(x_train)), gpu(collect(y_train)));
                              batchsize = batchsize, shuffle = true)
    # Don't shuffle the test data
    test_loader = DataLoader((gpu(collect(x_test)), gpu(collect(y_test)));
                             batchsize = batchsize, shuffle = false)
    return (train_loader, test_loader)
end
# Batch size and training fraction handed to `loadmnist`
const bs = 128
const train_split = 0.9
train_dataloader, test_dataloader = loadmnist(bs, train_split);

# Convolutional stack applied before the ODE layer
down = gpu(Chain(
    Conv((3, 3), 1 => 64, relu),
    GroupNorm(64, 64),
    Conv((4, 4), 64 => 64, relu; stride = (2, 2), pad = (1, 1)),
    GroupNorm(64, 64),
    Conv((4, 4), 64 => 64; stride = (2, 2), pad = (1, 1)),
))

# Network wrapped by `NeuralODE` below
dudt = gpu(Chain(
    Conv((3, 3), 64 => 64, relu; pad = (1, 1)),
    Conv((3, 3), 64 => 64, relu; pad = (1, 1)),
))

# Classification head ending in a 64 => 10 dense layer
fc = gpu(Chain(
    GroupNorm(64, 64),
    x -> relu.(x),
    # fc = Chain( # x->relu.(x),
    MeanPool((6, 6)),
    x -> reshape(x, 64, :),
    Dense(64, 10),
))

# Neural ODE over tspan (0, 1) solved with Tsit5
nn_ode = gpu(NeuralODE(dudt, (0.0f0, 1.0f0), Tsit5();
                       save_everystep = false,
                       reltol = 1e-3, abstol = 1e-3,
                       save_start = false))
# Pass `x` through `gpu`, then keep only index 1 along the fifth axis.
DiffEqArray_to_Array(x) = gpu(x)[:, :, :, :, 1]
# End-to-end classifier
model = Chain(
    down,                           # (28,28,1,BS) -> (6,6,64,BS)
    nn_ode,                         # (6,6,64,BS) -> (6,6,64,BS)
    DiffEqArray_to_Array,
    x -> reshape(x, 6, 6, 64, :),
    fc,                             # (6,6,64,BS) -> (10, BS)
)

# First stored observation, kept as a batch of one (no `gpu` call here)
img = train_dataloader.data[1][:, :, :, 1:1]
lab = train_dataloader.data[2][:, 1:1]
# Forward pass through the downsampling stack alone, then the whole model
x_d = down(img)
x_m = model(img)
# Index of the largest entry in each column of `x`, one result per column.
classify(x) = map(argmax, eachcol(x))
# Fraction of observations in `data` for which `classify` agrees on the target
# `y` and on `model(x)`. `data` yields `(x, y)` batches; both sides go through
# `cpu` before classification. At most `n_batches` batches are scored.
function accuracy(model, data; n_batches = 100)
    n_correct = 0
    n_seen = 0
    for (batch_idx, (x, y)) in enumerate(data)
        # Only evaluate accuracy for n_batches
        if batch_idx > n_batches
            break
        end
        truth = classify(cpu(y))
        guess = classify(cpu(model(x)))
        n_correct += count(truth .== guess)
        n_seen += length(truth)
    end
    return n_correct / n_seen
end
# Training objective: logit cross-entropy between the model output and `y`
function loss(x, y)
    logits = model(x)
    return logitcrossentropy(logits, y)
end
# loss(x, y) = Flux.mse(model(x), y)

# burn in loss
loss(img, lab)
# burn in accuracy
accuracy(model, train_dataloader)

opt = ADAM(1e-3)
# Callback invocation counter, incremented by `cb`
iter = 0
# Training callback: bump the global `iter` counter and, on iterations
# 1, 11, 21, ..., print the test accuracy computed directly on
# `test_dataloader`. The explicit GC / reclaim calls are left disabled.
function cb()
    global iter
    iter += 1
    # Monitor that the weights do in fact update
    # Every 10 training iterations show accuracy
    if iter % 10 == 1
        # train_accuracy = accuracy(model, CuIterator(train_dataloader)) * 100
        n_test_batches = length(test_dataloader)
        test_accuracy = 100 * accuracy(model, test_dataloader;
                                       n_batches = n_test_batches)
        @printf("Iter: %3d || Test Accuracy: %2.3f\n", iter, test_accuracy)
        # GC.gc()
        # CUDA.reclaim()
        # CUDA.memory_status()
    end
end
# Run `Flux.train!` directly over `train_dataloader`, updating the parameters
# of `down`, `nn_ode.p` and `fc`; `@time` reports the cost of the call.
@time Flux.train!(
    loss,
    Flux.params(down, nn_ode.p, fc),
    train_dataloader,
    opt;
    cb = cb,
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment