Created
November 29, 2019 16:21
-
-
Save maierfelix/dbee9abe2fa77520228ad2a7b596904c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
C:\Users\User\Documents\GitHub\momo>nvprof node C:\Users\User\Documents\GitHub\momo\bundle.js | |
(nvk) Using Vulkan v1.1.126 | |
(nvk) Validation checks are enabled | |
==12800== NVPROF is profiling process 12800, command: node C:\Users\User\Documents\GitHub\momo\bundle.js | |
==12800== Warning: Unified Memory Profiling is not supported on the current configuration because a pair of devices without peer-to-peer support is detected on this multi-GPU setup. When peer mappings are not available, system falls back to using zero-copy memory. It can cause kernels, which access unified memory, to run slower. More details can be found at: http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#um-managed-memory | |
[ 4][ KNOBS]: All knobs on default. | |
[ 4][ DISK CACHE]: Opened database: "C:\Users\User\AppData\Local\NVIDIA\OptixCache\cache7.db" | |
[ 4][ DISK CACHE]: Cache data size: "55.8 KiB" | |
[ 4][ DENOISER]: using cuda device "GeForce RTX 2070" (7.5), buffers: fp16, cuDNN v7500, rt v10010 | |
[ 4][ DENOISER]: layers created for resolution 1280 720, inp 8, outp 3 | |
Excluding validation layer 'VK_LAYER_LUNARG_core_validation' since it is not available | |
[ 4][ KNOBS]: All knobs on default. | |
[ 4][ DISK CACHE]: Opened database: "C:\Users\User\AppData\Local\NVIDIA\OptixCache\cache7.db" | |
[ 4][ DISK CACHE]: Cache data size: "55.8 KiB" | |
[ 4][ DENOISER]: using cuda device "GeForce RTX 2070" (7.5), buffers: fp16, cuDNN v7500, rt v10010 | |
[ 4][ DENOISER]: layers created for resolution 1280 720, inp 8, outp 3 | |
==12800== Profiling application: node C:\Users\User\Documents\GitHub\momo\bundle.js | |
==12800== Profiling result: | |
Type Time(%) Time Calls Avg Min Max Name | |
GPU activities: 32.39% 1.17506s 6831 172.02us 29.152us 637.24us turing_h1688cudnn_128x128_ldg8_relu_exp_small_nhwc_tn_v1 | |
30.74% 1.11522s 2484 448.96us 27.328us 1.0936ms turing_h1688cudnn_256x64_sliced1x2_ldg8_relu_exp_small_nhwc_tn_v1 | |
10.75% 389.95ms 3105 125.59us 6.1760us 469.40us void optix_exp::k_Scale2xConcat_NHWC<__half>(__half const *, __half const , optix_exp::k_Scale2xConcat_NHWC<__half>*, int, int, int, int) | |
6.08% 220.75ms 1863 118.49us 114.02us 122.05us void optix_exp::k_SetChannelRGB<__half>(__half*, optix_exp::floatAccess, int, int, int, int, int, int, float, float*) | |
4.40% 159.78ms 621 257.30us 251.55us 341.34us void implicit_gemm_nhwc_h884_small_ck_RF_resident_warp_splitM<int=2, int=2, int=3, int=3, int=1, int=1, int=32, int=64, int=32, int=1, int=32, int=32, int=8, int=0>(ImplicitGemmNhwcParams) | |
4.16% 150.92ms 621 243.03us 237.31us 323.74us void implicit_gemm_nhwc_h884_small_ck_RF_resident_warp_splitM<int=2, int=2, int=3, int=3, int=1, int=1, int=64, int=32, int=32, int=1, int=32, int=32, int=8, int=0>(ImplicitGemmNhwcParams) | |
3.37% 122.31ms 3105 39.390us 3.7120us 129.50us optix_exp::k_MaxPooling_NHWC(__half const *, __half*, int, int, int, int, int) | |
2.25% 81.571ms 621 131.35us 128.93us 170.88us void optix_exp::k_SpaceToDepth_NHWC<__half>(__half*, optix_exp::k_SpaceToDepth_NHWC<__half> const *, int, int, int, int) | |
1.59% 57.826ms 621 93.117us 91.423us 104.80us void optix_exp::k_copyOutputRGB<float, __half>(OptixImage2D, optix_exp::k_copyOutputRGB<float, __half>, __half*, int, int, int, int, int, int, unsigned int, unsigned int, int, float, float*, float) | |
1.39% 50.592ms 9315 5.4310us 2.5600us 22.752us cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | |
0.93% 33.798ms 621 54.424us 54.079us 64.768us void optix_exp::k_DepthToSpace_NHWC<__half>(__half*, optix_exp::k_DepthToSpace_NHWC<__half> const *, int, int, int, int) | |
0.86% 31.128ms 621 50.125us 49.728us 54.752us void optix_exp::k_autoexposure<float>(float*, float const *, int, int, int, int*) | |
0.35% 12.787ms 621 20.591us 19.776us 24.032us void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__reduce_by_key::ReduceByKeyAgent<thrust::constant_iterator<int, thrust::use_default, thrust::use_default>, thrust::device_ptr<float>, thrust::discard_iterator<int>, thrust::device_ptr<float>, thrust::equal_to<int>, thrust::plus<float>, int*, int>, thrust::constant_iterator<int, thrust::use_default, thrust::use_default>, thrust::device_ptr<float>, thrust::discard_iterator<int>, thrust::device_ptr<float>, int*, thrust::cuda_cub::cub::ReduceByKeyScanTileState<float, int, bool=1>, thrust::equal_to<int>, thrust::plus<float>, int, int>(thrust::use_default, thrust::use_default, thrust::constant_iterator<int, thrust::use_default, thrust::use_default>, float, thrust::device_ptr<float>, int, thrust::discard_iterator<int>, thrust::device_ptr<float>, int, thrust::equal_to<int>) | |
0.22% 7.9204ms 68 116.48us 3.0080us 755.13us [CUDA memcpy HtoD] | |
0.22% 7.9190ms 1939 4.0840us 2.8480us 18.368us [CUDA memset] | |
0.10% 3.5985ms 621 5.7940us 2.6560us 9.6000us optix_exp::k_autoexposure_result(float*, int*, float*, int, int) | |
0.07% 2.5568ms 621 4.1170us 3.1680us 14.752us [CUDA memcpy DtoH] | |
0.06% 2.1584ms 621 3.4750us 2.6880us 16.064us [CUDA memcpy DtoD] | |
0.06% 2.1198ms 621 3.4130us 2.4960us 3.7760us void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__reduce_by_key::InitAgent<thrust::cuda_cub::cub::ReduceByKeyScanTileState<float, int, bool=1>, int, int*>, thrust::cuda_cub::cub::ReduceByKeyScanTileState<float, int, bool=1>, int, int*>(int, bool=1, thrust::cuda_cub::cub::ReduceByKeyScanTileState<float, int, bool=1>) | |
0.01% 409.82us 34 12.053us 4.3520us 30.688us void optix_exp::nchwToNhwcKernel<__half, __half>(int, int, int, int, int, __half const *, __half*) | |
API calls: 92.59% 13.4374s 1863 7.2128ms 4.2000us 32.862ms cudaStreamSynchronize | |
2.97% 430.59ms 16 26.912ms 700ns 430.06ms cudaStreamCreateWithFlags | |
1.58% 229.09ms 628 364.79us 700ns 216.49ms cudaFree | |
1.14% 165.71ms 32326 5.1260us 3.1000us 105.80us cudaLaunchKernel | |
0.80% 115.52ms 621 186.03us 72.100us 367.60us cudaMemcpyAsync | |
0.24% 34.174ms 621 55.030us 16.100us 168.30us cudaWaitExternalSemaphoresAsync | |
0.18% 26.694ms 635 42.037us 5.7000us 5.4956ms cudaMalloc | |
0.15% 21.582ms 1931 11.176us 3.0000us 86.600us cuMemsetD8Async | |
0.08% 11.930ms 68 175.44us 2.6000us 904.00us cuMemcpyHtoDAsync | |
0.06% 8.3556ms 621 13.455us 11.300us 60.600us cudaSignalExternalSemaphoresAsync | |
0.05% 7.0464ms 29808 236ns 100ns 47.100us cudaGetLastError | |
0.04% 6.4629ms 621 10.407us 9.2000us 25.800us cuMemcpyDtoDAsync | |
0.03% 4.0882ms 1242 3.2910us 2.2000us 29.600us cudaFuncGetAttributes | |
0.01% 2.0900ms 1242 1.6820us 1.1000us 49.600us cudaFuncSetAttribute | |
0.01% 1.4813ms 2486 595ns 200ns 20.300us cudaGetDevice | |
0.01% 1.4337ms 2520 568ns 200ns 25.100us cudaDeviceGetAttribute | |
0.01% 1.4299ms 400 3.5740us 100ns 273.70us cuDeviceGetAttribute | |
0.01% 1.3048ms 2 652.40us 646.70us 658.10us cuMemAlloc | |
0.01% 1.1238ms 4 280.95us 261.10us 303.80us cudaExternalMemoryGetMappedBuffer | |
0.01% 1.0730ms 2000 536ns 200ns 160.30us cuEventCreate | |
0.01% 1.0474ms 621 1.6860us 1.4000us 11.800us cudaEventRecord | |
0.01% 742.50us 2 371.25us 16.400us 726.10us cudaHostAlloc | |
0.00% 693.80us 2484 279ns 100ns 24.300us cudaPeekAtLastError | |
0.00% 576.00us 8 72.000us 800ns 568.50us cudaStreamCreateWithPriority | |
0.00% 297.60us 4 74.400us 54.300us 93.900us cudaImportExternalMemory | |
0.00% 163.40us 354 461ns 200ns 9.6000us cuModuleGetFunction | |
0.00% 93.400us 8 11.675us 4.4000us 33.900us cudaMemsetAsync | |
0.00% 41.200us 4 10.300us 4.9000us 17.300us cuDeviceTotalMem | |
0.00% 21.000us 2 10.500us 8.2000us 12.800us cudaStreamCreate | |
0.00% 15.500us 2 7.7500us 2.9000us 12.600us cudaImportExternalSemaphore | |
0.00% 15.300us 24 637ns 400ns 2.9000us cudaEventCreateWithFlags | |
0.00% 7.3000us 1 7.3000us 7.3000us 7.3000us cuDeviceGetPCIBusId | |
0.00% 6.1000us 8 762ns 200ns 1.6000us cuCtxGetCurrent | |
0.00% 4.8000us 2 2.4000us 1.2000us 3.6000us cudaHostGetDevicePointer | |
0.00% 3.6000us 6 600ns 400ns 800ns cuDeviceGetName | |
0.00% 3.3000us 2 1.6500us 900ns 2.4000us cudaDeviceGetStreamPriorityRange | |
0.00% 2.7000us 2 1.3500us 1.0000us 1.7000us cudaGetDeviceCount | |
0.00% 2.5000us 4 625ns 300ns 1.5000us cuDeviceGetCount | |
0.00% 2.5000us 6 416ns 100ns 1.2000us cuDeviceGet | |
0.00% 2.4000us 1 2.4000us 2.4000us 2.4000us cuInit | |
0.00% 2.2000us 6 366ns 200ns 700ns cuCtxGetDevice | |
0.00% 1.0000us 2 500ns 500ns 500ns cuDeviceComputeCapability | |
0.00% 900ns 4 225ns 200ns 300ns cuDeviceGetUuid | |
0.00% 700ns 1 700ns 700ns 700ns cuDriverGetVersion | |
0.00% 500ns 2 250ns 200ns 300ns cuDeviceGetLuid |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment