maierfelix/gist:dbee9abe2fa77520228ad2a7b596904c Secret

## gistfile1.txt
C:\Users\User\Documents\GitHub\momo>nvprof node C:\Users\User\Documents\GitHub\momo\bundle.js
(nvk) Using Vulkan v1.1.126
(nvk) Validation checks are enabled
==12800== NVPROF is profiling process 12800, command: node C:\Users\User\Documents\GitHub\momo\bundle.js
==12800== Warning: Unified Memory Profiling is not supported on the current configuration because a pair of devices without peer-to-peer support is detected on this multi-GPU setup. When peer mappings are not available, system falls back to using zero-copy memory. It can cause kernels, which access unified memory, to run slower. More details can be found at: http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#um-managed-memory
[ 4][       KNOBS]: All knobs on default.

[ 4][  DISK CACHE]: Opened database: "C:\Users\User\AppData\Local\NVIDIA\OptixCache\cache7.db"
[ 4][  DISK CACHE]:     Cache data size: "55.8 KiB"
[ 4][    DENOISER]: using cuda device "GeForce RTX 2070" (7.5), buffers: fp16, cuDNN v7500, rt v10010
[ 4][    DENOISER]: layers created for resolution 1280 720, inp 8, outp 3
Excluding validation layer 'VK_LAYER_LUNARG_core_validation' since it is not available
[ 4][       KNOBS]: All knobs on default.

[ 4][  DISK CACHE]: Opened database: "C:\Users\User\AppData\Local\NVIDIA\OptixCache\cache7.db"
[ 4][  DISK CACHE]:     Cache data size: "55.8 KiB"
[ 4][    DENOISER]: using cuda device "GeForce RTX 2070" (7.5), buffers: fp16, cuDNN v7500, rt v10010
[ 4][    DENOISER]: layers created for resolution 1280 720, inp 8, outp 3
==12800== Profiling application: node C:\Users\User\Documents\GitHub\momo\bundle.js
==12800== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   32.39%  1.17506s      6831  172.02us  29.152us  637.24us  turing_h1688cudnn_128x128_ldg8_relu_exp_small_nhwc_tn_v1
                   30.74%  1.11522s      2484  448.96us  27.328us  1.0936ms  turing_h1688cudnn_256x64_sliced1x2_ldg8_relu_exp_small_nhwc_tn_v1
                   10.75%  389.95ms      3105  125.59us  6.1760us  469.40us  void optix_exp::k_Scale2xConcat_NHWC<__half>(__half const *, __half const , optix_exp::k_Scale2xConcat_NHWC<__half>*, int, int, int, int)
                    6.08%  220.75ms      1863  118.49us  114.02us  122.05us  void optix_exp::k_SetChannelRGB<__half>(__half*, optix_exp::floatAccess, int, int, int, int, int, int, float, float*)
                    4.40%  159.78ms       621  257.30us  251.55us  341.34us  void implicit_gemm_nhwc_h884_small_ck_RF_resident_warp_splitM<int=2, int=2, int=3, int=3, int=1, int=1, int=32, int=64, int=32, int=1, int=32, int=32, int=8, int=0>(ImplicitGemmNhwcParams)
                    4.16%  150.92ms       621  243.03us  237.31us  323.74us  void implicit_gemm_nhwc_h884_small_ck_RF_resident_warp_splitM<int=2, int=2, int=3, int=3, int=1, int=1, int=64, int=32, int=32, int=1, int=32, int=32, int=8, int=0>(ImplicitGemmNhwcParams)
                    3.37%  122.31ms      3105  39.390us  3.7120us  129.50us  optix_exp::k_MaxPooling_NHWC(__half const *, __half*, int, int, int, int, int)
                    2.25%  81.571ms       621  131.35us  128.93us  170.88us  void optix_exp::k_SpaceToDepth_NHWC<__half>(__half*, optix_exp::k_SpaceToDepth_NHWC<__half> const *, int, int, int, int)
                    1.59%  57.826ms       621  93.117us  91.423us  104.80us  void optix_exp::k_copyOutputRGB<float, __half>(OptixImage2D, optix_exp::k_copyOutputRGB<float, __half>, __half*, int, int, int, int, int, int, unsigned int, unsigned int, int, float, float*, float)
                    1.39%  50.592ms      9315  5.4310us  2.5600us  22.752us  cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams)
                    0.93%  33.798ms       621  54.424us  54.079us  64.768us  void optix_exp::k_DepthToSpace_NHWC<__half>(__half*, optix_exp::k_DepthToSpace_NHWC<__half> const *, int, int, int, int)
                    0.86%  31.128ms       621  50.125us  49.728us  54.752us  void optix_exp::k_autoexposure<float>(float*, float const *, int, int, int, int*)
                    0.35%  12.787ms       621  20.591us  19.776us  24.032us  void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__reduce_by_key::ReduceByKeyAgent<thrust::constant_iterator<int, thrust::use_default, thrust::use_default>, thrust::device_ptr<float>, thrust::discard_iterator<int>, thrust::device_ptr<float>, thrust::equal_to<int>, thrust::plus<float>, int*, int>, thrust::constant_iterator<int, thrust::use_default, thrust::use_default>, thrust::device_ptr<float>, thrust::discard_iterator<int>, thrust::device_ptr<float>, int*, thrust::cuda_cub::cub::ReduceByKeyScanTileState<float, int, bool=1>, thrust::equal_to<int>, thrust::plus<float>, int, int>(thrust::use_default, thrust::use_default, thrust::constant_iterator<int, thrust::use_default, thrust::use_default>, float, thrust::device_ptr<float>, int, thrust::discard_iterator<int>, thrust::device_ptr<float>, int, thrust::equal_to<int>)
                    0.22%  7.9204ms        68  116.48us  3.0080us  755.13us  [CUDA memcpy HtoD]
                    0.22%  7.9190ms      1939  4.0840us  2.8480us  18.368us  [CUDA memset]
                    0.10%  3.5985ms       621  5.7940us  2.6560us  9.6000us  optix_exp::k_autoexposure_result(float*, int*, float*, int, int)
                    0.07%  2.5568ms       621  4.1170us  3.1680us  14.752us  [CUDA memcpy DtoH]
                    0.06%  2.1584ms       621  3.4750us  2.6880us  16.064us  [CUDA memcpy DtoD]
                    0.06%  2.1198ms       621  3.4130us  2.4960us  3.7760us  void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__reduce_by_key::InitAgent<thrust::cuda_cub::cub::ReduceByKeyScanTileState<float, int, bool=1>, int, int*>, thrust::cuda_cub::cub::ReduceByKeyScanTileState<float, int, bool=1>, int, int*>(int, bool=1, thrust::cuda_cub::cub::ReduceByKeyScanTileState<float, int, bool=1>)
                    0.01%  409.82us        34  12.053us  4.3520us  30.688us  void optix_exp::nchwToNhwcKernel<__half, __half>(int, int, int, int, int, __half const *, __half*)
      API calls:   92.59%  13.4374s      1863  7.2128ms  4.2000us  32.862ms  cudaStreamSynchronize
                    2.97%  430.59ms        16  26.912ms     700ns  430.06ms  cudaStreamCreateWithFlags
                    1.58%  229.09ms       628  364.79us     700ns  216.49ms  cudaFree
                    1.14%  165.71ms     32326  5.1260us  3.1000us  105.80us  cudaLaunchKernel
                    0.80%  115.52ms       621  186.03us  72.100us  367.60us  cudaMemcpyAsync
                    0.24%  34.174ms       621  55.030us  16.100us  168.30us  cudaWaitExternalSemaphoresAsync
                    0.18%  26.694ms       635  42.037us  5.7000us  5.4956ms  cudaMalloc
                    0.15%  21.582ms      1931  11.176us  3.0000us  86.600us  cuMemsetD8Async
                    0.08%  11.930ms        68  175.44us  2.6000us  904.00us  cuMemcpyHtoDAsync
                    0.06%  8.3556ms       621  13.455us  11.300us  60.600us  cudaSignalExternalSemaphoresAsync
                    0.05%  7.0464ms     29808     236ns     100ns  47.100us  cudaGetLastError
                    0.04%  6.4629ms       621  10.407us  9.2000us  25.800us  cuMemcpyDtoDAsync
                    0.03%  4.0882ms      1242  3.2910us  2.2000us  29.600us  cudaFuncGetAttributes
                    0.01%  2.0900ms      1242  1.6820us  1.1000us  49.600us  cudaFuncSetAttribute
                    0.01%  1.4813ms      2486     595ns     200ns  20.300us  cudaGetDevice
                    0.01%  1.4337ms      2520     568ns     200ns  25.100us  cudaDeviceGetAttribute
                    0.01%  1.4299ms       400  3.5740us     100ns  273.70us  cuDeviceGetAttribute
                    0.01%  1.3048ms         2  652.40us  646.70us  658.10us  cuMemAlloc
                    0.01%  1.1238ms         4  280.95us  261.10us  303.80us  cudaExternalMemoryGetMappedBuffer
                    0.01%  1.0730ms      2000     536ns     200ns  160.30us  cuEventCreate
                    0.01%  1.0474ms       621  1.6860us  1.4000us  11.800us  cudaEventRecord
                    0.01%  742.50us         2  371.25us  16.400us  726.10us  cudaHostAlloc
                    0.00%  693.80us      2484     279ns     100ns  24.300us  cudaPeekAtLastError
                    0.00%  576.00us         8  72.000us     800ns  568.50us  cudaStreamCreateWithPriority
                    0.00%  297.60us         4  74.400us  54.300us  93.900us  cudaImportExternalMemory
                    0.00%  163.40us       354     461ns     200ns  9.6000us  cuModuleGetFunction
                    0.00%  93.400us         8  11.675us  4.4000us  33.900us  cudaMemsetAsync
                    0.00%  41.200us         4  10.300us  4.9000us  17.300us  cuDeviceTotalMem
                    0.00%  21.000us         2  10.500us  8.2000us  12.800us  cudaStreamCreate
                    0.00%  15.500us         2  7.7500us  2.9000us  12.600us  cudaImportExternalSemaphore
                    0.00%  15.300us        24     637ns     400ns  2.9000us  cudaEventCreateWithFlags
                    0.00%  7.3000us         1  7.3000us  7.3000us  7.3000us  cuDeviceGetPCIBusId
                    0.00%  6.1000us         8     762ns     200ns  1.6000us  cuCtxGetCurrent
                    0.00%  4.8000us         2  2.4000us  1.2000us  3.6000us  cudaHostGetDevicePointer
                    0.00%  3.6000us         6     600ns     400ns     800ns  cuDeviceGetName
                    0.00%  3.3000us         2  1.6500us     900ns  2.4000us  cudaDeviceGetStreamPriorityRange
                    0.00%  2.7000us         2  1.3500us  1.0000us  1.7000us  cudaGetDeviceCount
                    0.00%  2.5000us         4     625ns     300ns  1.5000us  cuDeviceGetCount
                    0.00%  2.5000us         6     416ns     100ns  1.2000us  cuDeviceGet
                    0.00%  2.4000us         1  2.4000us  2.4000us  2.4000us  cuInit
                    0.00%  2.2000us         6     366ns     200ns     700ns  cuCtxGetDevice
                    0.00%  1.0000us         2     500ns     500ns     500ns  cuDeviceComputeCapability
                    0.00%     900ns         4     225ns     200ns     300ns  cuDeviceGetUuid
                    0.00%     700ns         1     700ns     700ns     700ns  cuDriverGetVersion
                    0.00%     500ns         2     250ns     200ns     300ns  cuDeviceGetLuid
	C:\Users\User\Documents\GitHub\momo>nvprof node C:\Users\User\Documents\GitHub\momo\bundle.js
	(nvk) Using Vulkan v1.1.126
	(nvk) Validation checks are enabled
	==12800== NVPROF is profiling process 12800, command: node C:\Users\User\Documents\GitHub\momo\bundle.js
	==12800== Warning: Unified Memory Profiling is not supported on the current configuration because a pair of devices without peer-to-peer support is detected on this multi-GPU setup. When peer mappings are not available, system falls back to using zero-copy memory. It can cause kernels, which access unified memory, to run slower. More details can be found at: http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#um-managed-memory
	[ 4][ KNOBS]: All knobs on default.

	[ 4][ DISK CACHE]: Opened database: "C:\Users\User\AppData\Local\NVIDIA\OptixCache\cache7.db"
	[ 4][ DISK CACHE]: Cache data size: "55.8 KiB"
	[ 4][ DENOISER]: using cuda device "GeForce RTX 2070" (7.5), buffers: fp16, cuDNN v7500, rt v10010
	[ 4][ DENOISER]: layers created for resolution 1280 720, inp 8, outp 3
	Excluding validation layer 'VK_LAYER_LUNARG_core_validation' since it is not available
	[ 4][ KNOBS]: All knobs on default.

	[ 4][ DISK CACHE]: Opened database: "C:\Users\User\AppData\Local\NVIDIA\OptixCache\cache7.db"
	[ 4][ DISK CACHE]: Cache data size: "55.8 KiB"
	[ 4][ DENOISER]: using cuda device "GeForce RTX 2070" (7.5), buffers: fp16, cuDNN v7500, rt v10010
	[ 4][ DENOISER]: layers created for resolution 1280 720, inp 8, outp 3
	==12800== Profiling application: node C:\Users\User\Documents\GitHub\momo\bundle.js
	==12800== Profiling result:
	Type Time(%) Time Calls Avg Min Max Name
	GPU activities: 32.39% 1.17506s 6831 172.02us 29.152us 637.24us turing_h1688cudnn_128x128_ldg8_relu_exp_small_nhwc_tn_v1
	30.74% 1.11522s 2484 448.96us 27.328us 1.0936ms turing_h1688cudnn_256x64_sliced1x2_ldg8_relu_exp_small_nhwc_tn_v1
	10.75% 389.95ms 3105 125.59us 6.1760us 469.40us void optix_exp::k_Scale2xConcat_NHWC<__half>(__half const *, __half const , optix_exp::k_Scale2xConcat_NHWC<__half>*, int, int, int, int)
	6.08% 220.75ms 1863 118.49us 114.02us 122.05us void optix_exp::k_SetChannelRGB<__half>(__half*, optix_exp::floatAccess, int, int, int, int, int, int, float, float*)
	4.40% 159.78ms 621 257.30us 251.55us 341.34us void implicit_gemm_nhwc_h884_small_ck_RF_resident_warp_splitM<int=2, int=2, int=3, int=3, int=1, int=1, int=32, int=64, int=32, int=1, int=32, int=32, int=8, int=0>(ImplicitGemmNhwcParams)
	4.16% 150.92ms 621 243.03us 237.31us 323.74us void implicit_gemm_nhwc_h884_small_ck_RF_resident_warp_splitM<int=2, int=2, int=3, int=3, int=1, int=1, int=64, int=32, int=32, int=1, int=32, int=32, int=8, int=0>(ImplicitGemmNhwcParams)
	3.37% 122.31ms 3105 39.390us 3.7120us 129.50us optix_exp::k_MaxPooling_NHWC(__half const *, __half*, int, int, int, int, int)
	2.25% 81.571ms 621 131.35us 128.93us 170.88us void optix_exp::k_SpaceToDepth_NHWC<__half>(__half*, optix_exp::k_SpaceToDepth_NHWC<__half> const *, int, int, int, int)
	1.59% 57.826ms 621 93.117us 91.423us 104.80us void optix_exp::k_copyOutputRGB<float, __half>(OptixImage2D, optix_exp::k_copyOutputRGB<float, __half>, __half*, int, int, int, int, int, int, unsigned int, unsigned int, int, float, float*, float)
	1.39% 50.592ms 9315 5.4310us 2.5600us 22.752us cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams)
	0.93% 33.798ms 621 54.424us 54.079us 64.768us void optix_exp::k_DepthToSpace_NHWC<__half>(__half*, optix_exp::k_DepthToSpace_NHWC<__half> const *, int, int, int, int)
	0.86% 31.128ms 621 50.125us 49.728us 54.752us void optix_exp::k_autoexposure<float>(float*, float const *, int, int, int, int*)
	0.35% 12.787ms 621 20.591us 19.776us 24.032us void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__reduce_by_key::ReduceByKeyAgent<thrust::constant_iterator<int, thrust::use_default, thrust::use_default>, thrust::device_ptr<float>, thrust::discard_iterator<int>, thrust::device_ptr<float>, thrust::equal_to<int>, thrust::plus<float>, int*, int>, thrust::constant_iterator<int, thrust::use_default, thrust::use_default>, thrust::device_ptr<float>, thrust::discard_iterator<int>, thrust::device_ptr<float>, int*, thrust::cuda_cub::cub::ReduceByKeyScanTileState<float, int, bool=1>, thrust::equal_to<int>, thrust::plus<float>, int, int>(thrust::use_default, thrust::use_default, thrust::constant_iterator<int, thrust::use_default, thrust::use_default>, float, thrust::device_ptr<float>, int, thrust::discard_iterator<int>, thrust::device_ptr<float>, int, thrust::equal_to<int>)
	0.22% 7.9204ms 68 116.48us 3.0080us 755.13us [CUDA memcpy HtoD]
	0.22% 7.9190ms 1939 4.0840us 2.8480us 18.368us [CUDA memset]
	0.10% 3.5985ms 621 5.7940us 2.6560us 9.6000us optix_exp::k_autoexposure_result(float*, int*, float*, int, int)
	0.07% 2.5568ms 621 4.1170us 3.1680us 14.752us [CUDA memcpy DtoH]
	0.06% 2.1584ms 621 3.4750us 2.6880us 16.064us [CUDA memcpy DtoD]
	0.06% 2.1198ms 621 3.4130us 2.4960us 3.7760us void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__reduce_by_key::InitAgent<thrust::cuda_cub::cub::ReduceByKeyScanTileState<float, int, bool=1>, int, int*>, thrust::cuda_cub::cub::ReduceByKeyScanTileState<float, int, bool=1>, int, int*>(int, bool=1, thrust::cuda_cub::cub::ReduceByKeyScanTileState<float, int, bool=1>)
	0.01% 409.82us 34 12.053us 4.3520us 30.688us void optix_exp::nchwToNhwcKernel<__half, __half>(int, int, int, int, int, __half const *, __half*)
	API calls: 92.59% 13.4374s 1863 7.2128ms 4.2000us 32.862ms cudaStreamSynchronize
	2.97% 430.59ms 16 26.912ms 700ns 430.06ms cudaStreamCreateWithFlags
	1.58% 229.09ms 628 364.79us 700ns 216.49ms cudaFree
	1.14% 165.71ms 32326 5.1260us 3.1000us 105.80us cudaLaunchKernel
	0.80% 115.52ms 621 186.03us 72.100us 367.60us cudaMemcpyAsync
	0.24% 34.174ms 621 55.030us 16.100us 168.30us cudaWaitExternalSemaphoresAsync
	0.18% 26.694ms 635 42.037us 5.7000us 5.4956ms cudaMalloc
	0.15% 21.582ms 1931 11.176us 3.0000us 86.600us cuMemsetD8Async
	0.08% 11.930ms 68 175.44us 2.6000us 904.00us cuMemcpyHtoDAsync
	0.06% 8.3556ms 621 13.455us 11.300us 60.600us cudaSignalExternalSemaphoresAsync
	0.05% 7.0464ms 29808 236ns 100ns 47.100us cudaGetLastError
	0.04% 6.4629ms 621 10.407us 9.2000us 25.800us cuMemcpyDtoDAsync
	0.03% 4.0882ms 1242 3.2910us 2.2000us 29.600us cudaFuncGetAttributes
	0.01% 2.0900ms 1242 1.6820us 1.1000us 49.600us cudaFuncSetAttribute
	0.01% 1.4813ms 2486 595ns 200ns 20.300us cudaGetDevice
	0.01% 1.4337ms 2520 568ns 200ns 25.100us cudaDeviceGetAttribute
	0.01% 1.4299ms 400 3.5740us 100ns 273.70us cuDeviceGetAttribute
	0.01% 1.3048ms 2 652.40us 646.70us 658.10us cuMemAlloc
	0.01% 1.1238ms 4 280.95us 261.10us 303.80us cudaExternalMemoryGetMappedBuffer
	0.01% 1.0730ms 2000 536ns 200ns 160.30us cuEventCreate
	0.01% 1.0474ms 621 1.6860us 1.4000us 11.800us cudaEventRecord
	0.01% 742.50us 2 371.25us 16.400us 726.10us cudaHostAlloc
	0.00% 693.80us 2484 279ns 100ns 24.300us cudaPeekAtLastError
	0.00% 576.00us 8 72.000us 800ns 568.50us cudaStreamCreateWithPriority
	0.00% 297.60us 4 74.400us 54.300us 93.900us cudaImportExternalMemory
	0.00% 163.40us 354 461ns 200ns 9.6000us cuModuleGetFunction
	0.00% 93.400us 8 11.675us 4.4000us 33.900us cudaMemsetAsync
	0.00% 41.200us 4 10.300us 4.9000us 17.300us cuDeviceTotalMem
	0.00% 21.000us 2 10.500us 8.2000us 12.800us cudaStreamCreate
	0.00% 15.500us 2 7.7500us 2.9000us 12.600us cudaImportExternalSemaphore
	0.00% 15.300us 24 637ns 400ns 2.9000us cudaEventCreateWithFlags
	0.00% 7.3000us 1 7.3000us 7.3000us 7.3000us cuDeviceGetPCIBusId
	0.00% 6.1000us 8 762ns 200ns 1.6000us cuCtxGetCurrent
	0.00% 4.8000us 2 2.4000us 1.2000us 3.6000us cudaHostGetDevicePointer
	0.00% 3.6000us 6 600ns 400ns 800ns cuDeviceGetName
	0.00% 3.3000us 2 1.6500us 900ns 2.4000us cudaDeviceGetStreamPriorityRange
	0.00% 2.7000us 2 1.3500us 1.0000us 1.7000us cudaGetDeviceCount
	0.00% 2.5000us 4 625ns 300ns 1.5000us cuDeviceGetCount
	0.00% 2.5000us 6 416ns 100ns 1.2000us cuDeviceGet
	0.00% 2.4000us 1 2.4000us 2.4000us 2.4000us cuInit
	0.00% 2.2000us 6 366ns 200ns 700ns cuCtxGetDevice
	0.00% 1.0000us 2 500ns 500ns 500ns cuDeviceComputeCapability
	0.00% 900ns 4 225ns 200ns 300ns cuDeviceGetUuid
	0.00% 700ns 1 700ns 700ns 700ns cuDriverGetVersion
	0.00% 500ns 2 250ns 200ns 300ns cuDeviceGetLuid