Mattias Fält (mfalt)
using CUDA
import CUDA.CUDNN: cudnnConvolutionForward

# Convolution weights: 5x5 kernel, 3 input channels, 6 output channels
const W1 = cu(randn(5, 5, 3, 6))

function inference(imgs)
    out = cudnnConvolutionForward(W1, imgs)
    return maximum(Array(out))  # copy result back to the CPU and reduce
end
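The stack traces below come from fetching spawned tasks, so the failing test presumably calls `inference` from several tasks at once. A minimal sketch of such a driver (a hypothetical reconstruction, not the actual `test_cuda.jl`; the input shape and task count are assumptions):

```julia
using CUDA

# Assumed input batch: 32x32 RGB images, batch size 16
const imgs = cu(randn(Float32, 32, 32, 3, 16))

# Several tasks hit the GPU concurrently; fetching rethrows any
# nested task error, producing the TaskFailedException seen below.
tasks = [Threads.@spawn inference(imgs) for _ in 1:8]
results = fetch.(tasks)
```

Run with multiple threads (`julia -t auto`), this reproduces the kind of concurrent GPU access that triggers the errors that follow.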
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.103.01    Driver Version: 470.103.01    CUDA Version: 11.4   |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|===============================+======================+======================|
|   0  NVIDIA GeForce ...  Off  | 00000000:01:00.0 Off |                  N/A |
|  0%   43C    P8    10W / 250W |   2256MiB / 11175MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
ERROR: LoadError: TaskFailedException
Stacktrace:
 [1] wait
   @ ./task.jl:334 [inlined]
 [2] fetch(t::Task)
   @ Base ./task.jl:349
 [3] top-level scope
   @ ~/knightvision/piece_recognition/KnightVisionServer/test/test_cuda.jl:46

    nested task error: Out of GPU memory trying to allocate 13.184 MiB
const LOCK = ReentrantLock()

function inference(imgs)
    # Serialize GPU access: only one task runs the network at a time
    lock(LOCK)
    try
        out = nn(imgs)
        return maximum(cpu(out))
    finally
        unlock(LOCK)
    end
end
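An alternative to locking (a sketch of a different design, not something from this gist) is to funnel all GPU work through a single worker task via a `Channel`, so only one task ever touches CUDA; `nn` here is the Flux model defined further down:

```julia
# Requests pair the input batch with a reply channel for the result.
const REQUESTS = Channel{Tuple{Any,Channel}}(32)

# Single worker task: the only task that runs the model on the GPU.
@async for (imgs, reply) in REQUESTS
    put!(reply, maximum(cpu(nn(imgs))))
end

function inference_serialized(imgs)
    reply = Channel(1)
    put!(REQUESTS, (imgs, reply))
    return take!(reply)  # block until the worker has processed this request
end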
ERROR: LoadError: TaskFailedException
Stacktrace:
 [1] wait
   @ ./task.jl:334 [inlined]
 [2] fetch(t::Task)
   @ Base ./task.jl:349
 [3] top-level scope
   @ ~/knightvision/piece_recognition/KnightVisionServer/test/test_cuda.jl:39

    nested task error: CUDNNError: CUDNN_STATUS_INTERNAL_ERROR (code 4)
# After execution gets stuck, sending `schedule(t, nothing, error=true)` to the task:
Stacktrace:
 [1] try_yieldto(undo::typeof(Base.ensure_rescheduled))
   @ Base ./task.jl:777
 [2] wait()
   @ Base ./task.jl:837
 [3] wait(c::Base.GenericCondition{ReentrantLock})
   @ Base ./condition.jl:123
 [4] wait(e::Base.Event)
   @ Base ./lock.jl:366
ERROR: LoadError: TaskFailedException

    nested task error: Out of GPU memory trying to allocate 897.217 MiB
    Effective GPU memory usage: 52.47% (5.726 GiB/10.913 GiB)
    Memory pool usage: 3.117 GiB (3.656 GiB reserved)
    Stacktrace:
      [1] macro expansion
        @ ~/.julia/packages/CUDA/0IDh2/src/pool.jl:219 [inlined]
      [2] macro expansion
        @ ./timing.jl:299 [inlined]
using Flux

const nn = gpu(Chain(
    Conv((5, 5), 3=>6, relu),
    MaxPool((2, 2)),
    Conv((5, 5), 6=>16, relu),
    MaxPool((2, 2)),
    flatten,
    Dense(256, 120, relu),
    Dense(120, 84, relu),