Skip to content

Instantly share code, notes, and snippets.

@lcw
Last active May 3, 2022 21:16
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save lcw/1eba554a4fc75d21937e1c3d2c097e9f to your computer and use it in GitHub Desktop.
Save lcw/1eba554a4fc75d21937e1c3d2c097e9f to your computer and use it in GitHub Desktop.

ClimateMachine and Atum+Bennu for Held–Suarez

Performance

Atum+Bennu 1 day of HS

I made the following change to profile one day of the Held–Suarez (HS) experiment:

diff --git a/experiment/euler_gravity/held_suarez_deep.jl b/experiment/euler_gravity/held_suarez_deep.jl
index bd15531..b57701a 100644
--- a/experiment/euler_gravity/held_suarez_deep.jl
+++ b/experiment/euler_gravity/held_suarez_deep.jl
@@ -247,7 +247,7 @@ yp = components(aux)[2]
 zp = components(aux)[3]
 
 # test_state .= state
-endday = 30.0 * 40
+endday = 1.0
 tmp_ρ = components(test_state)[1]
 ρ̅_start = sum(tmp_ρ .* dg_fs.MJ) / sum(dg_fs.MJ)

Running the modified experiment under the profiler, we get:

==8669== Profiling application: julia-1.6 --project --check-bounds=no experiment/euler_gravity/held_suarez_deep.jl
==8669== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   18.42%  5.56585s      1512  3.6811ms  3.6601ms  3.9135ms  _Z33julia_gpu_banded_backward_kernel_7ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE28_gpu_banded_backward_kernel_16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi1E5TupleI5OneToI5Int64EEE7NDRangeILi1ES5_10StaticSizeI5_16__ES7_ILi1ES8_IS9_IS10_EEEvEE13ReshapedArrayI7Float64Li3E17PermutedDimsArrayIS14_Li5E15_1__3__2__4__5_15_1__3__2__4__5_13CuDeviceArrayIS14_Li5ELi1EEES8_I27SignedMultiplicativeInverseIS10_ES17_IS10_ES17_IS10_ES17_IS10_EEE15BatchedBandedLUILi16ELi300ELi20ELi20ELi1350ES14_S16_IS14_Li4ELi1EEES13_IS14_Li3ES15_IS14_Li5E15_1__3__2__4__5_15_1__3__2__4__5_S16_IS14_Li5ELi1EEES8_IS17_IS10_ES17_IS10_ES17_IS10_ES17_IS10_EEE
                   17.87%  5.40116s      1512  3.5722ms  3.5380ms  3.7667ms  _Z32julia_gpu_banded_forward_kernel_7ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE27_gpu_banded_forward_kernel_16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi1E5TupleI5OneToI5Int64EEE7NDRangeILi1ES5_10StaticSizeI5_16__ES7_ILi1ES8_IS9_IS10_EEEvEE13ReshapedArrayI7Float64Li3E17PermutedDimsArrayIS14_Li5E15_1__3__2__4__5_15_1__3__2__4__5_13CuDeviceArrayIS14_Li5ELi1EEES8_I27SignedMultiplicativeInverseIS10_ES17_IS10_ES17_IS10_ES17_IS10_EEE15BatchedBandedLUILi16ELi300ELi20ELi20ELi1350ES14_S16_IS14_Li4ELi1EEES13_IS14_Li3ES15_IS14_Li5E15_1__3__2__4__5_15_1__3__2__4__5_S16_IS14_Li5ELi1EEES8_IS17_IS10_ES17_IS10_ES17_IS10_ES17_IS10_EEE
                   16.07%  4.85761s        36  134.93ms  134.47ms  143.22ms  _Z26julia_gpu_bandedlu_kernel_7ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE21_gpu_bandedlu_kernel_16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi1E5TupleI5OneToI5Int64EEE7NDRangeILi1ES5_10StaticSizeI5_16__ES7_ILi1ES8_IS9_IS10_EEEvEE15BatchedBandedLUILi16ELi300ELi20ELi20ELi1350E7Float6413CuDeviceArrayIS14_Li4ELi1EEE
                   11.62%  3.51033s      3744  937.59us  709.98us  1.5513ms  _Z26julia_gpu_volume_term_dir_7ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE21_gpu_volume_term_dir_16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi3E5TupleI5OneToI5Int64ES9_IS10_ES9_IS10_EEE7NDRangeILi3ES5_10StaticSizeI9_4__4__4_ES7_ILi3ES8_IS9_IS10_ES9_IS10_ES9_IS10_EEEvEE19EulerTotalEnergyLawI7Float64Li3ELi5E49_____1_4__grav___9_81__pde_level_balance___false_E11StructArrayI6SArrayIS8_ILi5EES14_Li1ELi5EELi2ES8_I8SubArrayIS14_Li2E13CuDeviceArrayIS14_Li3ELi1EES8_I5SliceIS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEEE14CartesianIndexILi2EEES15_IS16_IS8_ILi5EES14_Li1ELi5EELi2ES8_IS17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEEES20_ILi2EEES18_IS14_Li2ELi1EE27LinearizedKennedyGruberFluxS15_I10NamedTupleI8__g___J_S8_IS16_IS8_ILi3ELi3EES14_Li2ELi9EES14_EELi2ES22_I8__g___J_S8_IS15_IS16_IS8_ILi3ELi3EES14_Li2ELi9EELi2ES8_IS17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES
18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEEES20_ILi2EEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEEEES20_ILi2EEES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES15_IS16_IS8_ILi9EES14_Li1ELi9EELi2ES8_IS18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EEES10_E4Bool3ValILi3EES24_ILi3EES24_ILi4EES24_ILi4EES24_ILi4EES24_ILi5EES24_ILi9EES24_ILifalseEE
                    6.76%  2.04192s      4536  450.16us  439.68us  465.69us  _Z28julia_broadcast_kernel_3018715CuKernelContext11StructArrayI6SArrayI5TupleILi5EE7Float64Li1ELi5EELi2ES2_I8SubArrayIS3_Li2E13CuDeviceArrayIS3_Li3ELi1EES2_I5SliceI5OneToI5Int64EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEEE14CartesianIndexILi2EEE11BroadcastedIvS2_IS7_IS8_ES7_IS8_EE2__S2_I8ExtrudedIS0_IS1_IS2_ILi5EES3_Li1ELi5EELi2ES2_IS4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEEES9_ILi2EEES2_I4BoolS13_ES2_IS8_S8_EES10_I16StructArrayStyleI12CuArrayStyleILi2EEEvS11_S2_IS3_S10_IS14_IS15_ILi2EEEvS11_S2_IS10_IS14_IS15_ILi2EEEvS11_S2_IS3_S12_IS0_IS1_IS2_ILi5EES3_Li1ELi5EELi2ES2_IS4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEEES9_ILi2EEES2_IS13_S13_ES2_IS8_S8_EEEES10_IS14_IS15_ILi2EEEvS11_S2_IS3_S12_IS0_IS1_IS2_ILi5EES3_Li1ELi5EELi2ES2_IS4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS
6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEEES9_ILi2EEES2_IS13_S13_ES2_IS8_S8_EEEEEEEEEES8_
                    4.25%  1.28388s      2268  566.08us  553.28us  777.24us  _Z26julia_gpu_volume_term_dir_7ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE21_gpu_volume_term_dir_16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi3E5TupleI5OneToI5Int64ES9_IS10_ES9_IS10_EEE7NDRangeILi3ES5_10StaticSizeI9_4__4__4_ES7_ILi3ES8_IS9_IS10_ES9_IS10_ES9_IS10_EEEvEE19EulerTotalEnergyLawI7Float64Li3ELi5E49_____1_4__grav___9_81__pde_level_balance___false_E11StructArrayI6SArrayIS8_ILi5EES14_Li1ELi5EELi2ES8_I8SubArrayIS14_Li2E13CuDeviceArrayIS14_Li3ELi1EES8_I5SliceIS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEEE14CartesianIndexILi2EEES15_IS16_IS8_ILi5EES14_Li1ELi5EELi2ES8_IS17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEEES20_ILi2EEES18_IS14_Li2ELi1EE17KennedyGruberFluxS15_I10NamedTupleI8__g___J_S8_IS16_IS8_ILi3ELi3EES14_Li2ELi9EES14_EELi2ES22_I8__g___J_S8_IS15_IS16_IS8_ILi3ELi3EES14_Li2ELi9EELi2ES8_IS17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li
3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEEES20_ILi2EEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEEEES20_ILi2EEES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES15_IS16_IS8_ILi9EES14_Li1ELi9EELi2ES8_IS18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EEES10_E4Bool3ValILi1EES24_ILi3EES24_ILi4EES24_ILi4EES24_ILi4EES24_ILi5EES24_ILi9EES24_ILifalseEE
                    3.63%  1.09746s      2268  483.89us  475.87us  492.70us  _Z28julia_broadcast_kernel_3000915CuKernelContext11StructArrayI6SArrayI5TupleILi5EE7Float64Li1ELi5EELi2ES2_I8SubArrayIS3_Li2E13CuDeviceArrayIS3_Li3ELi1EES2_I5SliceI5OneToI5Int64EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEEE14CartesianIndexILi2EEE11BroadcastedIvS2_IS7_IS8_ES7_IS8_EE2__S2_I8ExtrudedIS0_IS1_IS2_ILi5EES3_Li1ELi5EELi2ES2_IS4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEEES9_ILi2EEES2_I4BoolS13_ES2_IS8_S8_EES12_IS0_IS1_IS2_ILi5EES3_Li1ELi5EELi2ES2_IS4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEEES9_ILi2EEES2_IS13_S13_ES2_IS8_S8_EEEES8_
                    3.20%  967.35ms      3744  258.37us  238.94us  327.42us  _Z27julia_gpu_surface_term_dir_7ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE22_gpu_surface_term_dir_16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi2E5TupleI5OneToI5Int64ES9_IS10_EEE7NDRangeILi2ES5_10StaticSizeI7_16__2_ES7_ILi2ES8_IS9_IS10_ES9_IS10_EEEvEE19EulerTotalEnergyLawI7Float64Li3ELi5E49_____1_4__grav___9_81__pde_level_balance___false_E11StructArrayI6SArrayIS8_ILi5EES14_Li1ELi5EELi2ES8_I8SubArrayIS14_Li2E13CuDeviceArrayIS14_Li3ELi1EES8_I5SliceIS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEEE14CartesianIndexILi2EEES15_IS16_IS8_ILi5EES14_Li1ELi5EELi2ES8_IS17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEEES20_ILi2EEE3ValI27_0__16__32__48__64__80__96_ES21_ILi3EE21LinearizedRefanovFluxIS14_ES18_IS14_Li2ELi1EES18_IS10_Li2ELi1EES18_IS10_Li2ELi1EES18_IS14_Li2ELi1EES15_IS16_IS8_ILi3EES14_Li1ELi3EELi2ES8_IS17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEEES20_ILi2EEES18_IS10_Li2ELi1EES15_IS16_IS8_ILi9EES14_Li1ELi9EELi2ES8_IS18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS1
4_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EEES10_ES21_ILi3EE
                    2.71%  818.92ms      2268  361.07us  354.52us  515.96us  _Z26julia_gpu_volume_term_dir_7ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE21_gpu_volume_term_dir_16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi3E5TupleI5OneToI5Int64ES9_IS10_ES9_IS10_EEE7NDRangeILi3ES5_10StaticSizeI9_4__4__4_ES7_ILi3ES8_IS9_IS10_ES9_IS10_ES9_IS10_EEEvEE19EulerTotalEnergyLawI7Float64Li3ELi5E49_____1_4__grav___9_81__pde_level_balance___false_E11StructArrayI6SArrayIS8_ILi5EES14_Li1ELi5EELi2ES8_I8SubArrayIS14_Li2E13CuDeviceArrayIS14_Li3ELi1EES8_I5SliceIS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEEE14CartesianIndexILi2EEES15_IS16_IS8_ILi5EES14_Li1ELi5EELi2ES8_IS17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEEES20_ILi2EEES18_IS14_Li2ELi1EE17KennedyGruberFluxS15_I10NamedTupleI8__g___J_S8_IS16_IS8_ILi3ELi3EES14_Li2ELi9EES14_EELi2ES22_I8__g___J_S8_IS15_IS16_IS8_ILi3ELi3EES14_Li2ELi9EELi2ES8_IS17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li
3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEEES20_ILi2EEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEEEES20_ILi2EEES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES15_IS16_IS8_ILi9EES14_Li1ELi9EELi2ES8_IS18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EEES10_E4Bool3ValILi3EES24_ILi3EES24_ILi4EES24_ILi4EES24_ILi4EES24_ILi5EES24_ILi9EES24_ILitrueEE
                    2.67%  805.65ms      2268  355.22us  348.80us  518.56us  _Z26julia_gpu_volume_term_dir_7ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE21_gpu_volume_term_dir_16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi3E5TupleI5OneToI5Int64ES9_IS10_ES9_IS10_EEE7NDRangeILi3ES5_10StaticSizeI9_4__4__4_ES7_ILi3ES8_IS9_IS10_ES9_IS10_ES9_IS10_EEEvEE19EulerTotalEnergyLawI7Float64Li3ELi5E49_____1_4__grav___9_81__pde_level_balance___false_E11StructArrayI6SArrayIS8_ILi5EES14_Li1ELi5EELi2ES8_I8SubArrayIS14_Li2E13CuDeviceArrayIS14_Li3ELi1EES8_I5SliceIS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEEE14CartesianIndexILi2EEES15_IS16_IS8_ILi5EES14_Li1ELi5EELi2ES8_IS17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEEES20_ILi2EEES18_IS14_Li2ELi1EE17KennedyGruberFluxS15_I10NamedTupleI8__g___J_S8_IS16_IS8_ILi3ELi3EES14_Li2ELi9EES14_EELi2ES22_I8__g___J_S8_IS15_IS16_IS8_ILi3ELi3EES14_Li2ELi9EELi2ES8_IS17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li
3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEEES20_ILi2EEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEEEES20_ILi2EEES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES15_IS16_IS8_ILi9EES14_Li1ELi9EELi2ES8_IS18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EEES10_E4Bool3ValILi2EES24_ILi3EES24_ILi4EES24_ILi4EES24_ILi4EES24_ILi5EES24_ILi9EES24_ILitrueEE
                    2.49%  750.97ms      1553  483.56us  477.31us  665.98us  _Z28julia_broadcast_kernel_1993815CuKernelContext11StructArrayI6SArrayI5TupleILi5EE7Float64Li1ELi5EELi2ES2_I8SubArrayIS3_Li2E13CuDeviceArrayIS3_Li3ELi1EES2_I5SliceI5OneToI5Int64EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEEE14CartesianIndexILi2EEE11BroadcastedIvS2_IS7_IS8_ES7_IS8_EE9_identityS2_I8ExtrudedIS0_IS1_IS2_ILi5EES3_Li1ELi5EELi2ES2_IS4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEEES9_ILi2EEES2_I4BoolS13_ES2_IS8_S8_EEEES8_
                    2.40%  725.01ms      2268  319.67us  314.97us  325.21us  _Z27julia_gpu_surface_term_dir_7ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE22_gpu_surface_term_dir_16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi2E5TupleI5OneToI5Int64ES9_IS10_EEE7NDRangeILi2ES5_10StaticSizeI7_16__2_ES7_ILi2ES8_IS9_IS10_ES9_IS10_EEEvEE19EulerTotalEnergyLawI7Float64Li3ELi5E49_____1_4__grav___9_81__pde_level_balance___false_E11StructArrayI6SArrayIS8_ILi5EES14_Li1ELi5EELi2ES8_I8SubArrayIS14_Li2E13CuDeviceArrayIS14_Li3ELi1EES8_I5SliceIS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEEE14CartesianIndexILi2EEES15_IS16_IS8_ILi5EES14_Li1ELi5EELi2ES8_IS17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEEES20_ILi2EEE3ValI27_0__16__32__48__64__80__96_ES21_ILi1EE7RoeFluxS18_IS14_Li2ELi1EES18_IS10_Li2ELi1EES18_IS10_Li2ELi1EES18_IS14_Li2ELi1EES15_IS16_IS8_ILi3EES14_Li1ELi3EELi2ES8_IS17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEEES20_ILi2EEES18_IS10_Li2ELi1EES15_IS16_IS8_ILi9EES14_Li1ELi9EELi2ES8_IS18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_L
i2ELi1EES18_IS14_Li2ELi1EEES10_ES21_ILi3EE
                    2.22%  671.05ms      2268  295.88us  291.68us  302.75us  _Z27julia_gpu_surface_term_dir_7ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE22_gpu_surface_term_dir_16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi2E5TupleI5OneToI5Int64ES9_IS10_EEE7NDRangeILi2ES5_10StaticSizeI7_16__2_ES7_ILi2ES8_IS9_IS10_ES9_IS10_EEEvEE19EulerTotalEnergyLawI7Float64Li3ELi5E49_____1_4__grav___9_81__pde_level_balance___false_E11StructArrayI6SArrayIS8_ILi5EES14_Li1ELi5EELi2ES8_I8SubArrayIS14_Li2E13CuDeviceArrayIS14_Li3ELi1EES8_I5SliceIS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEEE14CartesianIndexILi2EEES15_IS16_IS8_ILi5EES14_Li1ELi5EELi2ES8_IS17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEEES20_ILi2EEE3ValI27_0__16__32__48__64__80__96_ES21_ILi2EE7RoeFluxS18_IS14_Li2ELi1EES18_IS10_Li2ELi1EES18_IS10_Li2ELi1EES18_IS14_Li2ELi1EES15_IS16_IS8_ILi3EES14_Li1ELi3EELi2ES8_IS17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEEES20_ILi2EEES18_IS10_Li2ELi1EES15_IS16_IS8_ILi9EES14_Li1ELi9EELi2ES8_IS18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_L
i2ELi1EES18_IS14_Li2ELi1EEES10_ES21_ILi3EE
                    1.75%  529.12ms      1476  358.48us  356.45us  408.96us  _Z34julia_gpu_banded_setvector_kernel_7ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE29_gpu_banded_setvector_kernel_16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi2E5TupleI5OneToI5Int64ES9_IS10_EEE7NDRangeILi2ES5_10StaticSizeI8_16__16_ES7_ILi2ES8_IS9_IS10_ES9_IS10_EEEvEE13ReshapedArrayI7Float64Li3E17PermutedDimsArrayIS14_Li5E15_1__3__2__4__5_15_1__3__2__4__5_13CuDeviceArrayIS14_Li5ELi1EEES8_I27SignedMultiplicativeInverseIS10_ES17_IS10_ES17_IS10_ES17_IS10_EEES13_IS14_Li3ES15_IS14_Li5E15_1__3__2__4__5_15_1__3__2__4__5_S16_IS14_Li5ELi1EEES8_IS17_IS10_ES17_IS10_ES17_IS10_ES17_IS10_EEES10_3ValILi41EES18_ILi16EES18_ILi300EE
                    1.68%  506.93ms      2268  223.51us  216.00us  231.23us  _Z27julia_gpu_surface_term_dir_7ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE22_gpu_surface_term_dir_16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi2E5TupleI5OneToI5Int64ES9_IS10_EEE7NDRangeILi2ES5_10StaticSizeI7_16__2_ES7_ILi2ES8_IS9_IS10_ES9_IS10_EEEvEE19EulerTotalEnergyLawI7Float64Li3ELi5E49_____1_4__grav___9_81__pde_level_balance___false_E11StructArrayI6SArrayIS8_ILi5EES14_Li1ELi5EELi2ES8_I8SubArrayIS14_Li2E13CuDeviceArrayIS14_Li3ELi1EES8_I5SliceIS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEEE14CartesianIndexILi2EEES15_IS16_IS8_ILi5EES14_Li1ELi5EELi2ES8_IS17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEEES20_ILi2EEE3ValI27_0__16__32__48__64__80__96_ES21_ILi3EE11RefanovFluxIS14_ES18_IS14_Li2ELi1EES18_IS10_Li2ELi1EES18_IS10_Li2ELi1EES18_IS14_Li2ELi1EES15_IS16_IS8_ILi3EES14_Li1ELi3EELi2ES8_IS17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEEES20_ILi2EEES18_IS10_Li2ELi1EES15_IS16_IS8_ILi9EES14_Li1ELi9EELi2ES8_IS18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1E
ES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EEES10_ES21_ILi3EE
                    0.97%  293.28ms      1476  198.70us  190.75us  235.52us  _Z34julia_gpu_banded_setmatrix_kernel_7ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE29_gpu_banded_setmatrix_kernel_16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi2E5TupleI5OneToI5Int64ES9_IS10_EEE7NDRangeILi2ES5_10StaticSizeI8_16__16_ES7_ILi2ES8_IS9_IS10_ES9_IS10_EEEvEE19BatchedBandedMatrixILi16ELi300ELi20ELi20ELi1350E7Float6413CuDeviceArrayIS14_Li4ELi1EEE13ReshapedArrayIS14_Li3E17PermutedDimsArrayIS14_Li5E15_1__3__2__4__5_15_1__3__2__4__5_S15_IS14_Li5ELi1EEES8_I27SignedMultiplicativeInverseIS10_ES18_IS10_ES18_IS10_ES18_IS10_EEES10_3ValILi16EE
                    0.70%  212.60ms        36  5.9057ms  5.8851ms  5.9305ms  julia_broadcast_kernel_26293(CuKernelContext, CuDeviceArray<Float64, int=4, int=1>, Broadcasted<CuArrayStyle<int=4>, Tuple<OneTo<Int64>, CuArrayStyle<int=4, Tuple>, CuArrayStyle<int=4, Tuple>, CuArrayStyle<int=4, Tuple>>, __, CuArrayStyle<Extruded<CuDeviceArray<Float64, int=4, int=1>, CuArrayStyle<Bool, OneTo<Int64>, OneTo<Int64>, OneTo<Int64>>, CuArrayStyle<Tuple, Tuple, Tuple, Tuple>>, Float64>>, Tuple)
                    0.15%  45.608ms        38  1.2002ms  1.1956ms  1.2071ms  _Z28julia_broadcast_kernel_2023315CuKernelContext11StructArrayI6SArrayI5TupleILi9EE7Float64Li1ELi9EELi2ES2_I13CuDeviceArrayIS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EEE5Int64E11BroadcastedIvS2_I5OneToIS5_ES7_IS5_EE17_sphere_auxiliaryS2_I10CuRefValueI19EulerTotalEnergyLawIS3_Li3ELi5E49_____1_4__grav___9_81__pde_level_balance___false_EES9_I10NamedTupleI72__a_______g___R_d_______p____cp_d___cv_d___H___T_________gravc___mearth_S2_IS3_S3_S3_S3_S3_S3_S3_S3_S3_S5_S3_S3_S3_EEE8ExtrudedIS0_IS1_IS2_ILi3EES3_Li1ELi3EELi2ES2_IS4_IS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EEES5_ES2_I4BoolS13_ES2_IS5_S5_EES12_IS0_IS1_IS2_ILi5EES3_Li1ELi5EELi2ES2_I8SubArrayIS3_Li2ES4_IS3_Li3ELi1EES2_I5SliceIS7_IS5_EES5_S15_IS7_IS5_EEELifalseEES14_IS3_Li2ES4_IS3_Li3ELi1EES2_IS15_IS7_IS5_EES5_S15_IS7_IS5_EEELifalseEES14_IS3_Li2ES4_IS3_Li3ELi1EES2_IS15_IS7_IS5_EES5_S15_IS7_IS5_EEELifalseEES14_IS3_Li2ES4_IS3_Li3ELi1EES2_IS15_IS7_IS5_EES5_S15_IS7_IS5_EEELifalseEES14_IS3_Li2ES4_IS3_Li3ELi1EES2_IS15_IS7_IS5_EES5_S15_IS7_IS5_EEELifalseEEE14CartesianIndexILi2EEES2_IS13_S13_ES2_IS5_S5_EEEES5_
                    0.14%  43.102ms        72  598.63us  589.40us  606.33us  julia_broadcast_kernel_23777(CuKernelContext, StructArray<SArray<Tuple<int=9>, Float64, int=1, int=9>, int=2, Tuple<CuDeviceArray<Tuple<int=9>, int=2, int=1>, Float64<Tuple<int=9>, int=2, int=1>, Float64<Tuple<int=9>, int=2, int=1>, Float64<Tuple<int=9>, int=2, int=1>, Float64<Tuple<int=9>, int=2, int=1>, Float64<Tuple<int=9>, int=2, int=1>, Float64<Tuple<int=9>, int=2, int=1>, Float64<Tuple<int=9>, int=2, int=1>, Float64<Tuple<int=9>, int=2, int=1>>, Int64>, Broadcasted<void, Tuple<OneTo<SArray<Tuple<int=9>, Float64, int=1, int=9>>, CuDeviceArray<Tuple<int=9, SArray<Tuple<int=9>, Float64, int=1, int=9>>, int=2, int=1>>, _identity, Tuple<Extruded<StructArray<SArray<Tuple<int=9>, Tuple<int=9>, int=1, int=9>, int=2, Tuple<Float64<Tuple<int=9>, int=2, int=1>, Float64<Tuple<int=9>, int=2, int=1>, Float64<Tuple<int=9>, int=2, int=1>, Float64<Tuple<int=9>, int=2, int=1>, Float64<Tuple<int=9>, int=2, int=1>, Float64<Tuple<int=9>, int=2, int=1>, Float64<Tuple<int=9>, int=2, int=1>, Float64<Tuple<int=9>, int=2, int=1>, Float64<Tuple<int=9>, int=2, int=1>>, SArray<Tuple<int=9>, Float64, int=1, int=9>>, Tuple<Bool, Float64<Tuple<int=9>, int=2, int=1>>, Tuple<SArray<Tuple<int=9>, Float64, int=1, int=9>, SArray<Tuple<int=9>, Float64, int=1, int=9>>>>>, SArray<Tuple<int=9>, Float64, int=1, int=9>)
                    0.07%  19.884ms        36  552.33us  548.00us  555.71us  _Z28julia_broadcast_kernel_3103615CuKernelContext11StructArrayI6SArrayI5TupleILi5EE7Float64Li1ELi5EELi2ES2_I8SubArrayIS3_Li2E13CuDeviceArrayIS3_Li3ELi1EES2_I5SliceI5OneToI5Int64EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEEE14CartesianIndexILi2EEE11BroadcastedIvS2_IS7_IS8_ES7_IS8_EE9_identityS2_I8ExtrudedIS0_IS1_IS2_ILi5EES3_Li1ELi5EELi2ES2_IS5_IS3_Li2ELi1EES5_IS3_Li2ELi1EES5_IS3_Li2ELi1EES5_IS3_Li2ELi1EES5_IS3_Li2ELi1EEES8_ES2_I4BoolS13_ES2_IS8_S8_EEEES8_
                    0.05%  15.732ms        21  749.12us  1.9520us  1.6317ms  [CUDA memcpy DtoH]
                    0.03%  10.114ms        72  140.47us  137.82us  143.58us  _Z28julia_broadcast_kernel_3071715CuKernelContext11StructArrayI6SArrayI5TupleILi5EE7Float64Li1ELi5EELi2ES2_I13CuDeviceArrayIS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EEE5Int64E11BroadcastedIvS2_I5OneToIS5_ES7_IS5_EE2__S2_IS3_8ExtrudedIS0_IS1_IS2_ILi5EES3_Li1ELi5EELi2ES2_I8SubArrayIS3_Li2ES4_IS3_Li3ELi1EES2_I5SliceIS7_IS5_EES5_S11_IS7_IS5_EEELifalseEES10_IS3_Li2ES4_IS3_Li3ELi1EES2_IS11_IS7_IS5_EES5_S11_IS7_IS5_EEELifalseEES10_IS3_Li2ES4_IS3_Li3ELi1EES2_IS11_IS7_IS5_EES5_S11_IS7_IS5_EEELifalseEES10_IS3_Li2ES4_IS3_Li3ELi1EES2_IS11_IS7_IS5_EES5_S11_IS7_IS5_EEELifalseEES10_IS3_Li2ES4_IS3_Li3ELi1EES2_IS11_IS7_IS5_EES5_S11_IS7_IS5_EEELifalseEEE14CartesianIndexILi2EEES2_I4BoolS13_ES2_IS5_S5_EEEES5_
                    0.03%  9.8474ms         1  9.8474ms  9.8474ms  9.8474ms  _Z33julia_gpu_materializefaceindices_7ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE28_gpu_materializefaceindices_16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi2E5TupleI5OneToI5Int64ES9_IS10_EEE7NDRangeILi2ES5_10StaticSizeI7_16__5_ES7_ILi2ES8_IS9_IS10_ES9_IS10_EEEvEE11LobattoCellI7Float645ArrayS8_ILi4ELi4ELi4EELi3ES8_I13CuDeviceArrayIS14_Li3ELi1EES15_IS14_Li3ELi1EES15_IS14_Li3ELi1EEES15_I6SArrayIS8_ILi3EES14_Li1ELi3EELi1ELi1EES8_I4KronIS8_I8DiagonalIS14_4OnesIS14_Li1ES8_IS9_IS10_EEEES18_IS14_S19_IS14_Li1ES8_IS9_IS10_EEEES15_IS14_Li2ELi1EEEES17_IS8_IS18_IS14_S19_IS14_Li1ES8_IS9_IS10_EEEES15_IS14_Li2ELi1EES18_IS14_S19_IS14_Li1ES8_IS9_IS10_EEEEEES17_IS8_IS15_IS14_Li2ELi1EES18_IS14_S19_IS14_Li1ES8_IS9_IS10_EEEES18_IS14_S19_IS14_Li1ES8_IS9_IS10_EEEEEEES18_IS14_S15_IS14_Li1ELi1EEES18_IS14_S15_IS14_Li1ELi1EEES17_IS8_IS15_IS14_Li2ELi1EES15_IS14_Li2ELi1EES15_IS14_Li2ELi1EEEES8_IS8_IS15_IS10_Li3ELi1EEES8_IS15_IS10_Li2ELi1EES15_IS10_Li2ELi1EES15_IS10_Li2ELi1EES15_IS10_Li2ELi1EES15_IS10_Li2ELi1EES15_IS10_Li2ELi1EEES8_IS15_IS10_Li1ELi1EES15_IS10_Li1ELi1EES15_IS10_Li1ELi1EES15_IS10_Li1ELi1EES15_IS10_Li1ELi1EES15_IS10_Li1ELi1EES15_IS10_Li1ELi1EES15_IS10_Li1ELi1EES15_IS10_Li1ELi1EES15_IS10_Li1ELi1EES15_IS10_Li1ELi1EES15_IS10_Li1ELi1EEES8_IS10_S10_S10_S10_S10_S10_S10_S10_EEES15_IS10_Li2ELi1EES15_IS10_Li2ELi1EE22GeneralSparseMatrixCSCI11PermutationILi4EES10_S15_IS21_ILi4EELi1ELi1EES15_IS10_Li1ELi1EEE
                    0.02%  7.2630ms        36  201.75us  199.20us  207.81us  _Z28julia_broadcast_kernel_3087115CuKernelContext11StructArrayI6SArrayI5TupleILi5EE7Float64Li1ELi5EELi2ES2_I13CuDeviceArrayIS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EEE5Int64E11BroadcastedIvS2_I5OneToIS5_ES7_IS5_EE2__S2_I8ExtrudedIS0_IS1_IS2_ILi5EES3_Li1ELi5EELi2ES2_IS4_IS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EEES5_ES2_I4BoolS10_ES2_IS5_S5_EES9_IS0_IS1_IS2_ILi5EES3_Li1ELi5EELi2ES2_IS4_IS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EEES5_ES2_IS10_S10_ES2_IS5_S5_EEEES5_
                    0.02%  6.2732ms        36  174.26us  170.65us  178.37us  _Z28julia_broadcast_kernel_2656415CuKernelContext8SubArrayI7Float64Li3E13CuDeviceArrayIS1_Li4ELi1EE5TupleI5SliceI5OneToI5Int64EES6_S4_IS5_IS6_EES4_IS5_IS6_EEELifalseEE11BroadcastedI12CuArrayStyleILi3EES3_IS5_IS6_ES5_IS6_ES5_IS6_EE2__S3_I8ExtrudedIS2_IS1_Li3ELi1EES3_I4BoolS11_S11_ES3_IS6_S6_S6_EES6_EES6_
                    0.02%  5.1752ms        53  97.645us  2.6560us  856.47us  [CUDA memcpy HtoD]
                    0.02%  5.0332ms        36  139.81us  138.46us  140.80us  julia_getindex_kernel_26436(CuKernelContext, CuDeviceArray<Float64, int=3, int=1>, CuDeviceArray<Float64, int=4, int=1>, Tuple<Int64, CuDeviceArray<Float64, int=4, int=1>, CuDeviceArray<Float64, int=4, int=1>, CuDeviceArray<Float64, int=4, int=1>>, Slice<OneTo<CuDeviceArray<Float64, int=4, int=1>>>, CuDeviceArray<Float64, int=4, int=1>, Tuple<Int64<CuDeviceArray<Float64, int=4, int=1>>>, Tuple<Int64<CuDeviceArray<Float64, int=4, int=1>>>)
                    0.00%  1.4316ms         2  715.82us  714.17us  717.47us  julia_broadcast_kernel_23559(CuKernelContext, StructArray<SArray<Tuple<int=11>, Float64, int=1, int=11>, int=2, Tuple<CuDeviceArray<Tuple<int=11>, int=2, int=1>, Float64<Tuple<int=11>, int=2, int=1>, Float64<Tuple<int=11>, int=2, int=1>, Float64<Tuple<int=11>, int=2, int=1>, Float64<Tuple<int=11>, int=2, int=1>, Float64<Tuple<int=11>, int=2, int=1>, Float64<Tuple<int=11>, int=2, int=1>, Float64<Tuple<int=11>, int=2, int=1>, Float64<Tuple<int=11>, int=2, int=1>, Float64<Tuple<int=11>, int=2, int=1>, Float64<Tuple<int=11>, int=2, int=1>>, Int64>, Broadcasted<void, Tuple<OneTo<SArray<Tuple<int=11>, Float64, int=1, int=11>>, CuDeviceArray<Tuple<int=11, SArray<Tuple<int=11>, Float64, int=1, int=11>>, int=2, int=1>>, __, Tuple<Extruded<StructArray<SArray<Tuple<int=11>, Tuple<int=11>, int=1, int=11>, int=2, Tuple<Float64<Tuple<int=11>, int=2, int=1>, Float64<Tuple<int=11>, int=2, int=1>, Float64<Tuple<int=11>, int=2, int=1>, Float64<Tuple<int=11>, int=2, int=1>, Float64<Tuple<int=11>, int=2, int=1>, Float64<Tuple<int=11>, int=2, int=1>, Float64<Tuple<int=11>, int=2, int=1>, Float64<Tuple<int=11>, int=2, int=1>, Float64<Tuple<int=11>, int=2, int=1>, Float64<Tuple<int=11>, int=2, int=1>, Float64<Tuple<int=11>, int=2, int=1>>, SArray<Tuple<int=11>, Float64, int=1, int=11>>, Tuple<Bool, Float64<Tuple<int=11>, int=2, int=1>>, Tuple<SArray<Tuple<int=11>, Float64, int=1, int=11>, SArray<Tuple<int=11>, Float64, int=1, int=11>>>, Tuple<int=11>>>, SArray<Tuple<int=11>, Float64, int=1, int=11>)
                    0.00%  1.2405ms         2  620.25us  619.13us  621.37us  _Z28julia_broadcast_kernel_2297215CuKernelContext11StructArrayI6SArrayI5TupleILi6EE7Float64Li1ELi6EELi2ES2_I13CuDeviceArrayIS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EEE5Int64E11BroadcastedIvS2_I5OneToIS5_ES7_IS5_EE15_mean_variablesS2_I10CuRefValueI19EulerTotalEnergyLawIS3_Li3ELi5E49_____1_4__grav___9_81__pde_level_balance___false_EE8ExtrudedIS0_IS1_IS2_ILi5EES3_Li1ELi5EELi2ES2_I8SubArrayIS3_Li2ES4_IS3_Li3ELi1EES2_I5SliceIS7_IS5_EES5_S13_IS7_IS5_EEELifalseEES12_IS3_Li2ES4_IS3_Li3ELi1EES2_IS13_IS7_IS5_EES5_S13_IS7_IS5_EEELifalseEES12_IS3_Li2ES4_IS3_Li3ELi1EES2_IS13_IS7_IS5_EES5_S13_IS7_IS5_EEELifalseEES12_IS3_Li2ES4_IS3_Li3ELi1EES2_IS13_IS7_IS5_EES5_S13_IS7_IS5_EEELifalseEES12_IS3_Li2ES4_IS3_Li3ELi1EES2_IS13_IS7_IS5_EES5_S13_IS7_IS5_EEELifalseEEE14CartesianIndexILi2EEES2_I4BoolS15_ES2_IS5_S5_EES11_IS0_IS1_IS2_ILi9EES3_Li1ELi9EELi2ES2_IS4_IS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EEES5_ES2_IS15_S15_ES2_IS5_S5_EEEES5_
                    0.00%  1.0412ms         2  520.59us  516.51us  524.67us  julia_broadcast_kernel_21637(CuKernelContext, StructArray<SArray<Tuple<int=9>, Float64, int=1, int=9>, int=2, Tuple<CuDeviceArray<Tuple<int=9>, int=2, int=1>, Float64<Tuple<int=9>, int=2, int=1>, Float64<Tuple<int=9>, int=2, int=1>, Float64<Tuple<int=9>, int=2, int=1>, Float64<Tuple<int=9>, int=2, int=1>, Float64<Tuple<int=9>, int=2, int=1>, Float64<Tuple<int=9>, int=2, int=1>, Float64<Tuple<int=9>, int=2, int=1>, Float64<Tuple<int=9>, int=2, int=1>>, Int64>, Broadcasted<void, Tuple<OneTo<SArray<Tuple<int=9>, Float64, int=1, int=9>>, CuDeviceArray<Tuple<int=9, SArray<Tuple<int=9>, Float64, int=1, int=9>>, int=2, int=1>>, _auxiliary, Tuple<CuRefValue<EulerTotalEnergyLaw<Tuple<int=9>, int=3, int=5, _____1_4__grav___9_81__pde_level_balance___false_>>, Extruded<StructArray<SArray<Tuple<int=3>, Tuple<int=9>, int=1, int=3>, int=2, Tuple<Float64<Tuple<int=9>, int=2, int=1>, Float64<Tuple<int=9>, int=2, int=1>, Float64<Tuple<int=9>, int=2, int=1>>, SArray<Tuple<int=9>, Float64, int=1, int=9>>, Tuple<Bool, Bool>, Tuple<SArray<Tuple<int=9>, Float64, int=1, int=9>, SArray<Tuple<int=9>, Float64, int=1, int=9>>>>>, SArray<Tuple<int=9>, Float64, int=1, int=9>)
                    0.00%  811.51us         9  90.168us  89.407us  91.327us  _Z18julia_gpu______4667ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE17_gpu______466_29616CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi4E5TupleI5OneToI5Int64ES9_IS10_ES9_IS10_ES9_IS10_EEE7NDRangeILi4ES5_S5_S7_ILi4ES8_IS9_IS10_ES9_IS10_ES9_IS10_ES9_IS10_EEES7_ILi4ES8_IS9_IS10_ES9_IS10_ES9_IS10_ES9_IS10_EEEEE13CuDeviceArrayI7Float64Li4ELi1EES12_IS13_Li2ELi1EES12_IS13_Li4ELi1EES9_IS10_ES9_IS10_ES9_IS10_ES9_IS10_ES9_IS10_Ev4Bool
                    0.00%  768.44us         9  85.382us  84.319us  87.487us  _Z18julia_gpu______4637ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE17_gpu______463_29216CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi4E5TupleI5OneToI5Int64ES9_IS10_ES9_IS10_ES9_IS10_EEE7NDRangeILi4ES5_S5_S7_ILi4ES8_IS9_IS10_ES9_IS10_ES9_IS10_ES9_IS10_EEES7_ILi4ES8_IS9_IS10_ES9_IS10_ES9_IS10_ES9_IS10_EEEEE13CuDeviceArrayI7Float64Li4ELi1EES12_IS13_Li2ELi1EES12_IS13_Li4ELi1EES9_IS10_ES9_IS10_ES9_IS10_ES9_IS10_ES9_IS10_Ev4Bool
                    0.00%  713.08us         9  79.231us  78.336us  80.639us  _Z18julia_gpu______4607ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE17_gpu______460_28816CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi4E5TupleI5OneToI5Int64ES9_IS10_ES9_IS10_ES9_IS10_EEE7NDRangeILi4ES5_S5_S7_ILi4ES8_IS9_IS10_ES9_IS10_ES9_IS10_ES9_IS10_EEES7_ILi4ES8_IS9_IS10_ES9_IS10_ES9_IS10_ES9_IS10_EEEEE13CuDeviceArrayI7Float64Li4ELi1EES12_IS13_Li2ELi1EES12_IS13_Li4ELi1EES9_IS10_ES9_IS10_ES9_IS10_ES9_IS10_ES9_IS10_Ev4Bool
                    0.00%  655.71us         1  655.71us  655.71us  655.71us  julia_broadcast_kernel_23225(CuKernelContext, StructArray<SArray<Tuple<int=11>, Float64, int=1, int=11>, int=2, Tuple<CuDeviceArray<Tuple<int=11>, int=2, int=1>, Float64<Tuple<int=11>, int=2, int=1>, Float64<Tuple<int=11>, int=2, int=1>, Float64<Tuple<int=11>, int=2, int=1>, Float64<Tuple<int=11>, int=2, int=1>, Float64<Tuple<int=11>, int=2, int=1>, Float64<Tuple<int=11>, int=2, int=1>, Float64<Tuple<int=11>, int=2, int=1>, Float64<Tuple<int=11>, int=2, int=1>, Float64<Tuple<int=11>, int=2, int=1>, Float64<Tuple<int=11>, int=2, int=1>>, Int64>, Broadcasted<void, Tuple<OneTo<SArray<Tuple<int=11>, Float64, int=1, int=11>>, CuDeviceArray<Tuple<int=11, SArray<Tuple<int=11>, Float64, int=1, int=11>>, int=2, int=1>>, _second_moment_variables, Tuple<Extruded<StructArray<SArray<Tuple<int=6>, Tuple<int=11>, int=1, int=6>, int=2, Tuple<Float64<Tuple<int=11>, int=2, int=1>, Float64<Tuple<int=11>, int=2, int=1>, Float64<Tuple<int=11>, int=2, int=1>, Float64<Tuple<int=11>, int=2, int=1>, Float64<Tuple<int=11>, int=2, int=1>, Float64<Tuple<int=11>, int=2, int=1>>, SArray<Tuple<int=11>, Float64, int=1, int=11>>, Tuple<Bool, Tuple<Float64<Tuple<int=11>, int=2, int=1>, Float64<Tuple<int=11>, int=2, int=1>, Float64<Tuple<int=11>, int=2, int=1>, Float64<Tuple<int=11>, int=2, int=1>, Float64<Tuple<int=11>, int=2, int=1>, Float64<Tuple<int=11>, int=2, int=1>>>, Tuple<SArray<Tuple<int=11>, Float64, int=1, int=11>, SArray<Tuple<int=11>, Float64, int=1, int=11>>>>>, SArray<Tuple<int=11>, Float64, int=1, int=11>)
                    0.00%  647.74us         9  71.970us  70.911us  72.992us  julia_broadcast_kernel_10937(CuKernelContext, CuDeviceArray<Float64, int=2, int=1>, Broadcasted<CuArrayStyle<int=2>, Tuple<OneTo<Int64>, CuArrayStyle<int=2, Tuple>>, __, CuArrayStyle<CuDeviceArray<Float64, int=2, int=1, Broadcasted<int=2>, void, OneTo, CuArrayStyle<Extruded<CuDeviceArray<Float64, int=2, int=1>, CuArrayStyle<Bool, OneTo<Int64>>, CuArrayStyle<Tuple, Tuple>>, Int64<CuDeviceArray<Float64, int=2, int=1>, CuArrayStyle<OneTo<Int64>, OneTo<Int64>>, CuArrayStyle<Tuple, Tuple>>>>, CuDeviceArray<Float64, int=2, int=1, Broadcasted<int=2>, void, OneTo, CuArrayStyle<Int64<CuDeviceArray<Float64, int=2, int=1>, CuArrayStyle<OneTo<Int64>, OneTo<Int64>>, CuArrayStyle<Tuple, Tuple>>, Int64<CuDeviceArray<Float64, int=2, int=1>, CuArrayStyle<OneTo<Int64>, OneTo<Int64>>, CuArrayStyle<Tuple, Tuple>>>>>>, Tuple)
                    0.00%  523.45us         9  58.161us  57.376us  58.944us  _Z28julia_broadcast_kernel_1109815CuKernelContext8SubArrayI7Float64Li2E13CuDeviceArrayIS1_Li3ELi1EE5TupleI5SliceI5OneToI5Int64EES6_S4_IS5_IS6_EEELifalseEE11BroadcastedI12CuArrayStyleILi2EES3_IS5_IS6_ES5_IS6_EE2__S3_IS7_IS8_ILi2EEvS9_S3_I8ExtrudedIS2_IS1_Li2ELi1EES3_I4BoolS11_ES3_IS6_S6_EES10_IS2_IS1_Li2ELi1EES3_IS11_S11_ES3_IS6_S6_EEEES7_IS8_ILi2EEvS9_S3_IS6_S10_IS0_IS1_Li2ES2_IS1_Li3ELi1EES3_IS4_IS5_IS6_EES6_S4_IS5_IS6_EEELifalseEES3_IS11_S11_ES3_IS6_S6_EEEEEES6_
                    0.00%  518.17us         1  518.17us  518.17us  518.17us  _Z28julia_broadcast_kernel_3120415CuKernelContext11StructArrayI6SArrayI5TupleILi5EE7Float64Li1ELi5EELi2ES2_I8SubArrayIS3_Li2E13CuDeviceArrayIS3_Li3ELi1EES2_I5SliceI5OneToI5Int64EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEEE14CartesianIndexILi2EEE11BroadcastedIvS2_IS7_IS8_ES7_IS8_EE2__S2_I8ExtrudedIS0_IS1_IS2_ILi5EES3_Li1ELi5EELi2ES2_IS4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEEES9_ILi2EEES2_I4BoolS13_ES2_IS8_S8_EES3_EES8_
                    0.00%  476.54us         1  476.54us  476.54us  476.54us  _Z28julia_broadcast_kernel_1432715CuKernelContext11StructArrayI6SArrayI5TupleILi3EE7Float64Li1ELi3EELi2ES2_I8SubArrayIS3_Li2E13CuDeviceArrayIS3_Li3ELi1EES2_I5SliceI5OneToI5Int64EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEEE14CartesianIndexILi2EEE11BroadcastedIvS2_IS7_IS8_ES7_IS8_EE2__S2_I8ExtrudedIS0_IS1_IS2_ILi3EES3_Li1ELi3EELi2ES2_IS4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEEES9_ILi2EEES2_I4BoolS13_ES2_IS8_S8_EES12_IS5_IS3_Li2ELi1EES2_IS13_S13_ES2_IS8_S8_EEEES8_
                    0.00%  394.65us         1  394.65us  394.65us  394.65us  _Z27julia_broadcast_kernel_982215CuKernelContext11StructArrayI6SArrayI5TupleILi3EE7Float64Li1ELi3EELi2ES2_I13CuDeviceArrayIS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EEE5Int64E11BroadcastedIvS2_I5OneToIS5_ES7_IS5_EE15_cubespherewarpS2_I8ExtrudedIS0_IS1_IS2_ILi3EES3_Li1ELi3EELi2ES2_I13ReshapedArrayIS3_Li2E8SubArrayIS3_Li4ES4_IS3_Li5ELi1EES2_I5SliceIS7_IS5_EES12_IS7_IS5_EES12_IS7_IS5_EES5_S12_IS7_IS5_EEELifalseEES2_I27SignedMultiplicativeInverseIS5_ES13_IS5_ES13_IS5_EEES10_IS3_Li2ES11_IS3_Li4ES4_IS3_Li5ELi1EES2_IS12_IS7_IS5_EES12_IS7_IS5_EES12_IS7_IS5_EES5_S12_IS7_IS5_EEELifalseEES2_IS13_IS5_ES13_IS5_ES13_IS5_EEES10_IS3_Li2ES11_IS3_Li4ES4_IS3_Li5ELi1EES2_IS12_IS7_IS5_EES12_IS7_IS5_EES12_IS7_IS5_EES5_S12_IS7_IS5_EEELifalseEES2_IS13_IS5_ES13_IS5_ES13_IS5_EEEE14CartesianIndexILi2EEES2_I4BoolS15_ES2_IS5_S5_EEEES5_
                    0.00%  330.91us         2  165.45us  164.54us  166.37us  _Z28julia_broadcast_kernel_2339215CuKernelContext11StructArrayI6SArrayI5TupleILi6EE7Float64Li1ELi6EELi2ES2_I13CuDeviceArrayIS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EEE5Int64E11BroadcastedIvS2_I5OneToIS5_ES7_IS5_EE2__S2_I8ExtrudedIS0_IS1_IS2_ILi6EES3_Li1ELi6EELi2ES2_IS4_IS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EEES5_ES2_I4BoolS10_ES2_IS5_S5_EES3_EES5_
                    0.00%  259.33us        10  25.932us  25.024us  28.992us  _Z31julia_linear_copy_kernel__3135815CuKernelContext13CuDeviceArrayI7Float64Li1ELi1EE5Int648SubArrayIS1_Li2ES0_IS1_Li3ELi1EE5TupleI5SliceI5OneToIS2_EES2_S5_IS6_IS2_EEELifalseEES2_S2_
                    0.00%  225.79us         6  37.631us  35.776us  46.112us  _Z28julia_broadcast_kernel_1418915CuKernelContext8SubArrayI7Float64Li2E13CuDeviceArrayIS1_Li3ELi1EE5TupleI5SliceI5OneToI5Int64EES6_S4_IS5_IS6_EEELifalseEE11BroadcastedI12CuArrayStyleILi2EES3_IS5_IS6_ES5_IS6_EE9_identityS3_I8ExtrudedIS2_IS1_Li2ELi1EES3_I4BoolS11_ES3_IS6_S6_EEEES6_
                    0.00%  154.88us         4  38.720us  31.872us  45.472us  julia_broadcast_kernel_21440(CuKernelContext, CuDeviceArray<Float64, int=2, int=1>, Broadcasted<CuArrayStyle<int=2>, Tuple<OneTo<Int64>, CuArrayStyle<int=2, Tuple>>, __, CuArrayStyle<Extruded<CuDeviceArray<Float64, int=1, int=1>, CuArrayStyle<Bool>, CuArrayStyle<Tuple>>, Int64<CuDeviceArray<Float64, int=2, int=1>, CuArrayStyle<OneTo<Int64>, OneTo<Int64>>, CuArrayStyle<Tuple, Tuple>>>>, Tuple)
                    0.00%  144.74us         4  36.183us  27.680us  43.744us  _Z31julia_linear_copy_kernel__2134115CuKernelContext13CuDeviceArrayI7Float64Li2ELi1EE5Int648SubArrayIS1_Li2ES0_IS1_Li3ELi1EE5TupleI5SliceI5OneToIS2_EES2_S5_IS6_IS2_EEELifalseEES2_S2_
                    0.00%  142.59us         1  142.59us  142.59us  142.59us  _Z28julia_broadcast_kernel_1076715CuKernelContext8SubArrayI7Float64Li2E13CuDeviceArrayIS1_Li3ELi1EE5TupleI5SliceI5OneToI5Int64EES6_S4_IS5_IS6_EEELifalseEE11BroadcastedIvS3_IS5_IS6_ES5_IS6_EE4_detS3_I8ExtrudedI11StructArrayI6SArrayIS3_ILi3ELi3EES1_Li2ELi9EELi2ES3_IS2_IS1_Li2ELi1EES2_IS1_Li2ELi1EES2_IS1_Li2ELi1EES2_IS1_Li2ELi1EES2_IS1_Li2ELi1EES2_IS1_Li2ELi1EES2_IS1_Li2ELi1EES2_IS1_Li2ELi1EES2_IS1_Li2ELi1EEES6_ES3_I4BoolS12_ES3_IS6_S6_EEEES6_
                    0.00%  107.10us         1  107.10us  107.10us  107.10us  _Z18julia_gpu______3257ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE16_gpu______325_9616CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi4E5TupleI5OneToI5Int64ES9_IS10_ES9_IS10_ES9_IS10_EEE7NDRangeILi4ES5_S5_S7_ILi4ES8_IS9_IS10_ES9_IS10_ES9_IS10_ES9_IS10_EEES7_ILi4ES8_IS9_IS10_ES9_IS10_ES9_IS10_ES9_IS10_EEEEE11StructArrayI6SArrayIS8_ILi3EE7Float64Li1ELi3EELi4ES8_I8SubArrayIS14_Li4E13CuDeviceArrayIS14_Li5ELi1EES8_I5SliceIS9_IS10_EES17_IS9_IS10_EES17_IS9_IS10_EES10_S17_IS9_IS10_EEELifalseEES15_IS14_Li4ES16_IS14_Li5ELi1EES8_IS17_IS9_IS10_EES17_IS9_IS10_EES17_IS9_IS10_EES10_S17_IS9_IS10_EEELifalseEES15_IS14_Li4ES16_IS14_Li5ELi1EES8_IS17_IS9_IS10_EES17_IS9_IS10_EES17_IS9_IS10_EES10_S17_IS9_IS10_EEELifalseEEE14CartesianIndexILi4EEES16_IS8_IS10_S10_S10_S10_S10_S10_S10_S10_ELi1ELi1EES16_IS14_Li1ELi1EES16_IS14_Li1ELi1EES16_IS14_Li1ELi1EES16_IS13_IS8_ILi3EES14_Li1ELi3EELi1ELi1EES9_IS10_ES9_IS10_ES9_IS10_ES9_IS10_ES9_IS10_ES9_IS10_ES9_IS10_ES9_IS10_Ev4Bool
                    0.00%  106.85us         4  26.711us  23.679us  29.599us  _Z34julia_partial_mapreduce_grid_226679_identity8_add_sum7Float6416CartesianIndicesILi2E5TupleI5OneToI5Int64ES4_IS5_EEES2_ILi2ES3_IS4_IS5_ES4_IS5_EEE3ValILitrueEE13CuDeviceArrayIS1_Li3ELi1EE11BroadcastedI12CuArrayStyleILi2EES3_IS4_IS5_ES4_IS5_EES_S3_IS7_IS1_Li2ELi1EEEE
                    0.00%  104.80us         1  104.80us  104.80us  104.80us  _Z28julia_broadcast_kernel_2045415CuKernelContext13CuDeviceArrayI7Float64Li2ELi1EE11BroadcastedIv5TupleI5OneToI5Int64ES4_IS5_EE9_pressureS3_I10CuRefValueI19EulerTotalEnergyLawIS1_Li3ELi5E49_____1_4__grav___9_81__pde_level_balance___false_EE8ExtrudedI11StructArrayI6SArrayIS3_ILi5EES1_Li1ELi5EELi2ES3_I8SubArrayIS1_Li2ES0_IS1_Li3ELi1EES3_I5SliceIS4_IS5_EES5_S13_IS4_IS5_EEELifalseEES12_IS1_Li2ES0_IS1_Li3ELi1EES3_IS13_IS4_IS5_EES5_S13_IS4_IS5_EEELifalseEES12_IS1_Li2ES0_IS1_Li3ELi1EES3_IS13_IS4_IS5_EES5_S13_IS4_IS5_EEELifalseEES12_IS1_Li2ES0_IS1_Li3ELi1EES3_IS13_IS4_IS5_EES5_S13_IS4_IS5_EEELifalseEES12_IS1_Li2ES0_IS1_Li3ELi1EES3_IS13_IS4_IS5_EES5_S13_IS4_IS5_EEELifalseEEE14CartesianIndexILi2EEES3_I4BoolS15_ES3_IS5_S5_EES9_IS10_IS11_IS3_ILi9EES1_Li1ELi9EELi2ES3_IS0_IS1_Li2ELi1EES0_IS1_Li2ELi1EES0_IS1_Li2ELi1EES0_IS1_Li2ELi1EES0_IS1_Li2ELi1EES0_IS1_Li2ELi1EES0_IS1_Li2ELi1EES0_IS1_Li2ELi1EES0_IS1_Li2ELi1EEES5_ES3_IS15_S15_ES3_IS5_S5_EEEES5_
                    0.00%  87.263us         2  43.631us  43.391us  43.872us  _Z28julia_broadcast_kernel_2253715CuKernelContext13CuDeviceArrayI7Float64Li2ELi1EE11BroadcastedI12CuArrayStyleILi2EE5TupleI5OneToI5Int64ES5_IS6_EE2__S4_I8ExtrudedI8SubArrayIS1_Li2ES0_IS1_Li3ELi1EES4_I5SliceIS5_IS6_EES6_S10_IS5_IS6_EEELifalseEES4_I4BoolS11_ES4_IS6_S6_EES8_IS0_IS1_Li2ELi1EES4_IS11_S11_ES4_IS6_S6_EEEES6_
                    0.00%  85.759us         3  28.586us  27.680us  29.152us  _Z34julia_partial_mapreduce_grid_219859_identity4_min7Float6416CartesianIndicesILi2E5TupleI5OneToI5Int64ES4_IS5_EEES2_ILi2ES3_IS4_IS5_ES4_IS5_EEE3ValILitrueEE13CuDeviceArrayIS1_Li3ELi1EE11BroadcastedI12CuArrayStyleILi2EES3_IS4_IS5_ES4_IS5_EES_S3_IS7_IS1_Li2ELi1EEEE
                    0.00%  84.959us         1  84.959us  84.959us  84.959us  _Z28julia_broadcast_kernel_1405515CuKernelContext13CuDeviceArrayI7Float64Li2ELi1EE11BroadcastedIv5TupleI5OneToI5Int64ES4_IS5_EE5_normS3_I8ExtrudedI11StructArrayI6SArrayIS3_ILi3EES1_Li1ELi3EELi2ES3_I8SubArrayIS1_Li2ES0_IS1_Li3ELi1EES3_I5SliceIS4_IS5_EES5_S11_IS4_IS5_EEELifalseEES10_IS1_Li2ES0_IS1_Li3ELi1EES3_IS11_IS4_IS5_EES5_S11_IS4_IS5_EEELifalseEES10_IS1_Li2ES0_IS1_Li3ELi1EES3_IS11_IS4_IS5_EES5_S11_IS4_IS5_EEELifalseEEE14CartesianIndexILi2EEES3_I4BoolS13_ES3_IS5_S5_EEEES5_
                    0.00%  63.424us         2  31.712us  31.488us  31.936us  julia_broadcast_kernel_21535(CuKernelContext, CuDeviceArray<Float64, int=2, int=1>, Broadcasted<CuArrayStyle<int=2>, Tuple<OneTo<Int64>, CuArrayStyle<int=2, Tuple>>, __, CuArrayStyle<Tuple, Extruded<CuDeviceArray<Float64, int=2, int=1>, CuArrayStyle<Bool, OneTo<Int64>>, CuArrayStyle<Tuple, Tuple>>>>, Tuple)
                    0.00%  59.360us         1  59.360us  59.360us  59.360us  _Z39julia_gpu_min_neighbour_distance_kernel7ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE34_gpu_min_neighbour_distance_kernel16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi1E5TupleI5OneToI5Int64EEE7NDRangeILi1ES5_10StaticSizeI6_256__ES7_ILi1ES8_IS9_IS10_EEEvEE13CuDeviceArrayI7Float64Li2ELi1EE11StructArrayI6SArrayIS8_ILi3EES14_Li1ELi3EELi2ES8_IS13_IS14_Li2ELi1EES13_IS14_Li2ELi1EES13_IS14_Li2ELi1EEES10_E3ValI10_1__4__16_ES17_ILi64EES17_ILi1EE
                    0.00%  57.119us         1  57.119us  57.119us  57.119us  _Z39julia_gpu_min_neighbour_distance_kernel7ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE34_gpu_min_neighbour_distance_kernel16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi1E5TupleI5OneToI5Int64EEE7NDRangeILi1ES5_10StaticSizeI6_256__ES7_ILi1ES8_IS9_IS10_EEEvEE13CuDeviceArrayI7Float64Li2ELi1EE11StructArrayI6SArrayIS8_ILi3EES14_Li1ELi3EELi2ES8_IS13_IS14_Li2ELi1EES13_IS14_Li2ELi1EES13_IS14_Li2ELi1EEES10_E3ValI10_1__4__16_ES17_ILi64EES17_ILi2EE
                    0.00%  55.232us         1  55.232us  55.232us  55.232us  _Z39julia_gpu_min_neighbour_distance_kernel7ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE34_gpu_min_neighbour_distance_kernel16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi1E5TupleI5OneToI5Int64EEE7NDRangeILi1ES5_10StaticSizeI6_256__ES7_ILi1ES8_IS9_IS10_EEEvEE13CuDeviceArrayI7Float64Li2ELi1EE11StructArrayI6SArrayIS8_ILi3EES14_Li1ELi3EELi2ES8_IS13_IS14_Li2ELi1EES13_IS14_Li2ELi1EES13_IS14_Li2ELi1EEES10_E3ValI10_1__4__16_ES17_ILi64EES17_ILi3EE
                    0.00%  48.832us         3  16.277us  4.8650us  23.264us  julia__5_14579(CuKernelContext, CuDeviceArray<Int64, int=2, int=1>, Int64)
                    0.00%  44.096us         1  44.096us  44.096us  44.096us  _Z28julia_broadcast_kernel_2063915CuKernelContext13CuDeviceArrayI7Float64Li2ELi1EE11BroadcastedI12CuArrayStyleILi2EE5TupleI5OneToI5Int64ES5_IS6_EE11_soundspeedS4_I10CuRefValueI19EulerTotalEnergyLawIS1_Li3ELi5E49_____1_4__grav___9_81__pde_level_balance___false_EE8ExtrudedI8SubArrayIS1_Li2ES0_IS1_Li3ELi1EES4_I5SliceIS5_IS6_EES6_S12_IS5_IS6_EEELifalseEES4_I4BoolS13_ES4_IS6_S6_EES10_IS0_IS1_Li2ELi1EES4_IS13_S13_ES4_IS6_S6_EEEES6_
                    0.00%  39.744us         1  39.744us  39.744us  39.744us  _Z18julia_gpu______3377ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE17_gpu______337_11216CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi3E5TupleI5OneToI5Int64ES9_IS10_ES9_IS10_EEE7NDRangeILi3ES5_S5_S7_ILi3ES8_IS9_IS10_ES9_IS10_ES9_IS10_EEES7_ILi3ES8_IS9_IS10_ES9_IS10_ES9_IS10_EEEEE13ReshapedArrayI7Float64Li3E8SubArrayIS13_Li2E13CuDeviceArrayIS13_Li3ELi1EES8_I9UnitRangeIS10_ES10_5SliceIS9_IS10_EEELifalseEES8_I27SignedMultiplicativeInverseIS10_EEES12_IS13_Li4ES14_IS13_Li2ES15_IS13_Li3ELi1EES8_IS17_IS9_IS10_EES10_S17_IS9_IS10_EEELifalseEES8_IS18_IS10_EEES12_IS13_Li4ES14_IS13_Li2ES15_IS13_Li3ELi1EES8_IS17_IS9_IS10_EES10_S17_IS9_IS10_EEELifalseEES8_IS18_IS10_EEES9_IS10_ES9_IS10_ES9_IS10_Ev4Bool
                    0.00%  39.424us         1  39.424us  39.424us  39.424us  _Z18julia_gpu______3407ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE17_gpu______340_11616CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi3E5TupleI5OneToI5Int64ES9_IS10_ES9_IS10_EEE7NDRangeILi3ES5_S5_S7_ILi3ES8_IS9_IS10_ES9_IS10_ES9_IS10_EEES7_ILi3ES8_IS9_IS10_ES9_IS10_ES9_IS10_EEEEE13ReshapedArrayI7Float64Li3E8SubArrayIS13_Li2E13CuDeviceArrayIS13_Li3ELi1EES8_I9UnitRangeIS10_ES10_5SliceIS9_IS10_EEELifalseEES8_I27SignedMultiplicativeInverseIS10_EEES12_IS13_Li4ES14_IS13_Li2ES15_IS13_Li3ELi1EES8_IS17_IS9_IS10_EES10_S17_IS9_IS10_EEELifalseEES8_IS18_IS10_EEES12_IS13_Li4ES14_IS13_Li2ES15_IS13_Li3ELi1EES8_IS17_IS9_IS10_EES10_S17_IS9_IS10_EEELifalseEES8_IS18_IS10_EEES9_IS10_ES9_IS10_ES9_IS10_Ev4Bool
                    0.00%  39.263us         1  39.263us  39.263us  39.263us  _Z18julia_gpu______3737ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE17_gpu______373_16016CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi3E5TupleI5OneToI5Int64ES9_IS10_ES9_IS10_EEE7NDRangeILi3ES5_S5_S7_ILi3ES8_IS9_IS10_ES9_IS10_ES9_IS10_EEES7_ILi3ES8_IS9_IS10_ES9_IS10_ES9_IS10_EEEEE13ReshapedArrayI7Float64Li3E8SubArrayIS13_Li2E13CuDeviceArrayIS13_Li3ELi1EES8_I9UnitRangeIS10_ES10_5SliceIS9_IS10_EEELifalseEES8_I27SignedMultiplicativeInverseIS10_EEES12_IS13_Li4ES14_IS13_Li2ES15_IS13_Li3ELi1EES8_IS17_IS9_IS10_EES10_S17_IS9_IS10_EEELifalseEES8_IS18_IS10_EEES12_IS13_Li4ES14_IS13_Li2ES15_IS13_Li3ELi1EES8_IS17_IS9_IS10_EES10_S17_IS9_IS10_EEELifalseEES8_IS18_IS10_EEES9_IS10_ES9_IS10_ES9_IS10_Ev4Bool
                    0.00%  39.136us         1  39.136us  39.136us  39.136us  _Z18julia_gpu______3767ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE17_gpu______376_16416CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi3E5TupleI5OneToI5Int64ES9_IS10_ES9_IS10_EEE7NDRangeILi3ES5_S5_S7_ILi3ES8_IS9_IS10_ES9_IS10_ES9_IS10_EEES7_ILi3ES8_IS9_IS10_ES9_IS10_ES9_IS10_EEEEE13ReshapedArrayI7Float64Li3E8SubArrayIS13_Li2E13CuDeviceArrayIS13_Li3ELi1EES8_I9UnitRangeIS10_ES10_5SliceIS9_IS10_EEELifalseEES8_I27SignedMultiplicativeInverseIS10_EEES12_IS13_Li4ES14_IS13_Li2ES15_IS13_Li3ELi1EES8_IS17_IS9_IS10_EES10_S17_IS9_IS10_EEELifalseEES8_IS18_IS10_EEES12_IS13_Li4ES14_IS13_Li2ES15_IS13_Li3ELi1EES8_IS17_IS9_IS10_EES10_S17_IS9_IS10_EEELifalseEES8_IS18_IS10_EEES9_IS10_ES9_IS10_ES9_IS10_Ev4Bool
                    0.00%  39.072us         1  39.072us  39.072us  39.072us  _Z18julia_gpu______3587ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE17_gpu______358_14016CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi3E5TupleI5OneToI5Int64ES9_IS10_ES9_IS10_EEE7NDRangeILi3ES5_S5_S7_ILi3ES8_IS9_IS10_ES9_IS10_ES9_IS10_EEES7_ILi3ES8_IS9_IS10_ES9_IS10_ES9_IS10_EEEEE13ReshapedArrayI7Float64Li3E8SubArrayIS13_Li2E13CuDeviceArrayIS13_Li3ELi1EES8_I9UnitRangeIS10_ES10_5SliceIS9_IS10_EEELifalseEES8_I27SignedMultiplicativeInverseIS10_EEES12_IS13_Li4ES14_IS13_Li2ES15_IS13_Li3ELi1EES8_IS17_IS9_IS10_EES10_S17_IS9_IS10_EEELifalseEES8_IS18_IS10_EEES12_IS13_Li4ES14_IS13_Li2ES15_IS13_Li3ELi1EES8_IS17_IS9_IS10_EES10_S17_IS9_IS10_EEELifalseEES8_IS18_IS10_EEES9_IS10_ES9_IS10_ES9_IS10_Ev4Bool
                    0.00%  38.784us         1  38.784us  38.784us  38.784us  _Z18julia_gpu______3557ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE17_gpu______355_13616CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi3E5TupleI5OneToI5Int64ES9_IS10_ES9_IS10_EEE7NDRangeILi3ES5_S5_S7_ILi3ES8_IS9_IS10_ES9_IS10_ES9_IS10_EEES7_ILi3ES8_IS9_IS10_ES9_IS10_ES9_IS10_EEEEE13ReshapedArrayI7Float64Li3E8SubArrayIS13_Li2E13CuDeviceArrayIS13_Li3ELi1EES8_I9UnitRangeIS10_ES10_5SliceIS9_IS10_EEELifalseEES8_I27SignedMultiplicativeInverseIS10_EEES12_IS13_Li4ES14_IS13_Li2ES15_IS13_Li3ELi1EES8_IS17_IS9_IS10_EES10_S17_IS9_IS10_EEELifalseEES8_IS18_IS10_EEES12_IS13_Li4ES14_IS13_Li2ES15_IS13_Li3ELi1EES8_IS17_IS9_IS10_EES10_S17_IS9_IS10_EEELifalseEES8_IS18_IS10_EEES9_IS10_ES9_IS10_ES9_IS10_Ev4Bool
                    0.00%  29.120us         3  9.7060us  7.4240us  10.880us  _Z34julia_partial_mapreduce_grid_221149_identity4_min7Float6416CartesianIndicesILi3E5TupleI5OneToI5Int64ES4_IS5_ES4_IS5_EEES2_ILi3ES3_IS4_IS5_ES4_IS5_ES4_IS5_EEE3ValILitrueEE13CuDeviceArrayIS1_Li4ELi1EES7_IS1_Li3ELi1EE
                    0.00%  27.295us         4  6.8230us  5.1520us  9.5360us  _Z34julia_partial_mapreduce_grid_227969_identity8_add_sum7Float6416CartesianIndicesILi3E5TupleI5OneToI5Int64ES4_IS5_ES4_IS5_EEES2_ILi3ES3_IS4_IS5_ES4_IS5_ES4_IS5_EEE3ValILitrueEE13CuDeviceArrayIS1_Li4ELi1EES7_IS1_Li3ELi1EE
                    0.00%  26.976us         1  26.976us  26.976us  26.976us  _Z18julia_gpu______3467ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE17_gpu______346_12416CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi3E5TupleI5OneToI5Int64ES9_IS10_ES9_IS10_EEE7NDRangeILi3ES5_S5_S7_ILi3ES8_IS9_IS10_ES9_IS10_ES9_IS10_EEES7_ILi3ES8_IS9_IS10_ES9_IS10_ES9_IS10_EEEEE13ReshapedArrayI7Float64Li3E8SubArrayIS13_Li2E13CuDeviceArrayIS13_Li3ELi1EES8_I9UnitRangeIS10_ES10_5SliceIS9_IS10_EEELifalseEES8_I27SignedMultiplicativeInverseIS10_EEES12_IS13_Li4ES14_IS13_Li2ES15_IS13_Li3ELi1EES8_IS17_IS9_IS10_EES10_S17_IS9_IS10_EEELifalseEES8_IS18_IS10_EEES12_IS13_Li4ES14_IS13_Li2ES15_IS13_Li3ELi1EES8_IS17_IS9_IS10_EES10_S17_IS9_IS10_EEELifalseEES8_IS18_IS10_EEES9_IS10_ES9_IS10_ES9_IS10_Ev4Bool
                    0.00%  26.911us         1  26.911us  26.911us  26.911us  _Z34julia_partial_mapreduce_grid_209079_identity4_max7Float6416CartesianIndicesILi2E5TupleI5OneToI5Int64ES4_IS5_EEES2_ILi2ES3_IS4_IS5_ES4_IS5_EEE3ValILitrueEE13CuDeviceArrayIS1_Li3ELi1EE11BroadcastedI12CuArrayStyleILi2EES3_IS4_IS5_ES4_IS5_EES_S3_IS7_IS1_Li2ELi1EEEE
                    0.00%  26.848us         1  26.848us  26.848us  26.848us  _Z18julia_gpu______3437ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE17_gpu______343_12016CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi3E5TupleI5OneToI5Int64ES9_IS10_ES9_IS10_EEE7NDRangeILi3ES5_S5_S7_ILi3ES8_IS9_IS10_ES9_IS10_ES9_IS10_EEES7_ILi3ES8_IS9_IS10_ES9_IS10_ES9_IS10_EEEEE13ReshapedArrayI7Float64Li3E8SubArrayIS13_Li2E13CuDeviceArrayIS13_Li3ELi1EES8_I9UnitRangeIS10_ES10_5SliceIS9_IS10_EEELifalseEES8_I27SignedMultiplicativeInverseIS10_EEES12_IS13_Li4ES14_IS13_Li2ES15_IS13_Li3ELi1EES8_IS17_IS9_IS10_EES10_S17_IS9_IS10_EEELifalseEES8_IS18_IS10_EEES12_IS13_Li4ES14_IS13_Li2ES15_IS13_Li3ELi1EES8_IS17_IS9_IS10_EES10_S17_IS9_IS10_EEELifalseEES8_IS18_IS10_EEES9_IS10_ES9_IS10_ES9_IS10_Ev4Bool
                    0.00%  26.816us         1  26.816us  26.816us  26.816us  _Z18julia_gpu______3797ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE17_gpu______379_16816CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi3E5TupleI5OneToI5Int64ES9_IS10_ES9_IS10_EEE7NDRangeILi3ES5_S5_S7_ILi3ES8_IS9_IS10_ES9_IS10_ES9_IS10_EEES7_ILi3ES8_IS9_IS10_ES9_IS10_ES9_IS10_EEEEE13ReshapedArrayI7Float64Li3E8SubArrayIS13_Li2E13CuDeviceArrayIS13_Li3ELi1EES8_I9UnitRangeIS10_ES10_5SliceIS9_IS10_EEELifalseEES8_I27SignedMultiplicativeInverseIS10_EEES12_IS13_Li4ES14_IS13_Li2ES15_IS13_Li3ELi1EES8_IS17_IS9_IS10_EES10_S17_IS9_IS10_EEELifalseEES8_IS18_IS10_EEES12_IS13_Li4ES14_IS13_Li2ES15_IS13_Li3ELi1EES8_IS17_IS9_IS10_EES10_S17_IS9_IS10_EEELifalseEES8_IS18_IS10_EEES9_IS10_ES9_IS10_ES9_IS10_Ev4Bool
                    0.00%  26.624us         1  26.624us  26.624us  26.624us  _Z18julia_gpu______3617ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE17_gpu______361_14416CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi3E5TupleI5OneToI5Int64ES9_IS10_ES9_IS10_EEE7NDRangeILi3ES5_S5_S7_ILi3ES8_IS9_IS10_ES9_IS10_ES9_IS10_EEES7_ILi3ES8_IS9_IS10_ES9_IS10_ES9_IS10_EEEEE13ReshapedArrayI7Float64Li3E8SubArrayIS13_Li2E13CuDeviceArrayIS13_Li3ELi1EES8_I9UnitRangeIS10_ES10_5SliceIS9_IS10_EEELifalseEES8_I27SignedMultiplicativeInverseIS10_EEES12_IS13_Li4ES14_IS13_Li2ES15_IS13_Li3ELi1EES8_IS17_IS9_IS10_EES10_S17_IS9_IS10_EEELifalseEES8_IS18_IS10_EEES12_IS13_Li4ES14_IS13_Li2ES15_IS13_Li3ELi1EES8_IS17_IS9_IS10_EES10_S17_IS9_IS10_EEELifalseEES8_IS18_IS10_EEES9_IS10_ES9_IS10_ES9_IS10_Ev4Bool
                    0.00%  26.624us         1  26.624us  26.624us  26.624us  _Z18julia_gpu______3827ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE17_gpu______382_17216CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi3E5TupleI5OneToI5Int64ES9_IS10_ES9_IS10_EEE7NDRangeILi3ES5_S5_S7_ILi3ES8_IS9_IS10_ES9_IS10_ES9_IS10_EEES7_ILi3ES8_IS9_IS10_ES9_IS10_ES9_IS10_EEEEE13ReshapedArrayI7Float64Li3E8SubArrayIS13_Li2E13CuDeviceArrayIS13_Li3ELi1EES8_I9UnitRangeIS10_ES10_5SliceIS9_IS10_EEELifalseEES8_I27SignedMultiplicativeInverseIS10_EEES12_IS13_Li4ES14_IS13_Li2ES15_IS13_Li3ELi1EES8_IS17_IS9_IS10_EES10_S17_IS9_IS10_EEELifalseEES8_IS18_IS10_EEES12_IS13_Li4ES14_IS13_Li2ES15_IS13_Li3ELi1EES8_IS17_IS9_IS10_EES10_S17_IS9_IS10_EEELifalseEES8_IS18_IS10_EEES9_IS10_ES9_IS10_ES9_IS10_Ev4Bool
                    0.00%  26.560us         1  26.560us  26.560us  26.560us  _Z18julia_gpu______3647ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE17_gpu______364_14816CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi3E5TupleI5OneToI5Int64ES9_IS10_ES9_IS10_EEE7NDRangeILi3ES5_S5_S7_ILi3ES8_IS9_IS10_ES9_IS10_ES9_IS10_EEES7_ILi3ES8_IS9_IS10_ES9_IS10_ES9_IS10_EEEEE13ReshapedArrayI7Float64Li3E8SubArrayIS13_Li2E13CuDeviceArrayIS13_Li3ELi1EES8_I9UnitRangeIS10_ES10_5SliceIS9_IS10_EEELifalseEES8_I27SignedMultiplicativeInverseIS10_EEES12_IS13_Li4ES14_IS13_Li2ES15_IS13_Li3ELi1EES8_IS17_IS9_IS10_EES10_S17_IS9_IS10_EEELifalseEES8_IS18_IS10_EEES12_IS13_Li4ES14_IS13_Li2ES15_IS13_Li3ELi1EES8_IS17_IS9_IS10_EES10_S17_IS9_IS10_EEELifalseEES8_IS18_IS10_EEES9_IS10_ES9_IS10_ES9_IS10_Ev4Bool
                    0.00%  26.432us         1  26.432us  26.432us  26.432us  _Z18julia_gpu______3527ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE17_gpu______352_13216CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi3E5TupleI5OneToI5Int64ES9_IS10_ES9_IS10_EEE7NDRangeILi3ES5_S5_S7_ILi3ES8_IS9_IS10_ES9_IS10_ES9_IS10_EEES7_ILi3ES8_IS9_IS10_ES9_IS10_ES9_IS10_EEEEE13ReshapedArrayI7Float64Li3E8SubArrayIS13_Li2E13CuDeviceArrayIS13_Li3ELi1EES8_I9UnitRangeIS10_ES10_5SliceIS9_IS10_EEELifalseEES8_I27SignedMultiplicativeInverseIS10_EEES12_IS13_Li4ES14_IS13_Li2ES15_IS13_Li3ELi1EES8_IS17_IS9_IS10_EES10_S17_IS9_IS10_EEELifalseEES8_IS18_IS10_EEES12_IS13_Li4ES14_IS13_Li2ES15_IS13_Li3ELi1EES8_IS17_IS9_IS10_EES10_S17_IS9_IS10_EEELifalseEES8_IS18_IS10_EEES9_IS10_ES9_IS10_ES9_IS10_Ev4Bool
                    0.00%  26.368us         1  26.368us  26.368us  26.368us  _Z18julia_gpu______3677ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE17_gpu______367_15216CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi3E5TupleI5OneToI5Int64ES9_IS10_ES9_IS10_EEE7NDRangeILi3ES5_S5_S7_ILi3ES8_IS9_IS10_ES9_IS10_ES9_IS10_EEES7_ILi3ES8_IS9_IS10_ES9_IS10_ES9_IS10_EEEEE13ReshapedArrayI7Float64Li3E8SubArrayIS13_Li2E13CuDeviceArrayIS13_Li3ELi1EES8_I9UnitRangeIS10_ES10_5SliceIS9_IS10_EEELifalseEES8_I27SignedMultiplicativeInverseIS10_EEES12_IS13_Li4ES14_IS13_Li2ES15_IS13_Li3ELi1EES8_IS17_IS9_IS10_EES10_S17_IS9_IS10_EEELifalseEES8_IS18_IS10_EEES12_IS13_Li4ES14_IS13_Li2ES15_IS13_Li3ELi1EES8_IS17_IS9_IS10_EES10_S17_IS9_IS10_EEELifalseEES8_IS18_IS10_EEES9_IS10_ES9_IS10_ES9_IS10_Ev4Bool
                    0.00%  26.112us         1  26.112us  26.112us  26.112us  _Z18julia_gpu______3497ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE17_gpu______349_12816CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi3E5TupleI5OneToI5Int64ES9_IS10_ES9_IS10_EEE7NDRangeILi3ES5_S5_S7_ILi3ES8_IS9_IS10_ES9_IS10_ES9_IS10_EEES7_ILi3ES8_IS9_IS10_ES9_IS10_ES9_IS10_EEEEE13ReshapedArrayI7Float64Li3E8SubArrayIS13_Li2E13CuDeviceArrayIS13_Li3ELi1EES8_I9UnitRangeIS10_ES10_5SliceIS9_IS10_EEELifalseEES8_I27SignedMultiplicativeInverseIS10_EEES12_IS13_Li4ES14_IS13_Li2ES15_IS13_Li3ELi1EES8_IS17_IS9_IS10_EES10_S17_IS9_IS10_EEELifalseEES8_IS18_IS10_EEES12_IS13_Li4ES14_IS13_Li2ES15_IS13_Li3ELi1EES8_IS17_IS9_IS10_EES10_S17_IS9_IS10_EEELifalseEES8_IS18_IS10_EEES9_IS10_ES9_IS10_ES9_IS10_Ev4Bool
                    0.00%  25.952us         1  25.952us  25.952us  25.952us  _Z18julia_gpu______3707ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE17_gpu______370_15616CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi3E5TupleI5OneToI5Int64ES9_IS10_ES9_IS10_EEE7NDRangeILi3ES5_S5_S7_ILi3ES8_IS9_IS10_ES9_IS10_ES9_IS10_EEES7_ILi3ES8_IS9_IS10_ES9_IS10_ES9_IS10_EEEEE13ReshapedArrayI7Float64Li3E8SubArrayIS13_Li2E13CuDeviceArrayIS13_Li3ELi1EES8_I9UnitRangeIS10_ES10_5SliceIS9_IS10_EEELifalseEES8_I27SignedMultiplicativeInverseIS10_EEES12_IS13_Li4ES14_IS13_Li2ES15_IS13_Li3ELi1EES8_IS17_IS9_IS10_EES10_S17_IS9_IS10_EEELifalseEES8_IS18_IS10_EEES12_IS13_Li4ES14_IS13_Li2ES15_IS13_Li3ELi1EES8_IS17_IS9_IS10_EES10_S17_IS9_IS10_EEELifalseEES8_IS18_IS10_EEES9_IS10_ES9_IS10_ES9_IS10_Ev4Bool
                    0.00%  25.920us         1  25.920us  25.920us  25.920us  _Z18julia_gpu______3857ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE17_gpu______385_17616CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi3E5TupleI5OneToI5Int64ES9_IS10_ES9_IS10_EEE7NDRangeILi3ES5_S5_S7_ILi3ES8_IS9_IS10_ES9_IS10_ES9_IS10_EEES7_ILi3ES8_IS9_IS10_ES9_IS10_ES9_IS10_EEEEE13ReshapedArrayI7Float64Li3E8SubArrayIS13_Li2E13CuDeviceArrayIS13_Li3ELi1EES8_I9UnitRangeIS10_ES10_5SliceIS9_IS10_EEELifalseEES8_I27SignedMultiplicativeInverseIS10_EEES12_IS13_Li4ES14_IS13_Li2ES15_IS13_Li3ELi1EES8_IS17_IS9_IS10_EES10_S17_IS9_IS10_EEELifalseEES8_IS18_IS10_EEES12_IS13_Li4ES14_IS13_Li2ES15_IS13_Li3ELi1EES8_IS17_IS9_IS10_EES10_S17_IS9_IS10_EEELifalseEES8_IS18_IS10_EEES9_IS10_ES9_IS10_ES9_IS10_Ev4Bool
                    0.00%  25.632us         1  25.632us  25.632us  25.632us  _Z18julia_gpu______3887ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE17_gpu______388_18016CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi3E5TupleI5OneToI5Int64ES9_IS10_ES9_IS10_EEE7NDRangeILi3ES5_S5_S7_ILi3ES8_IS9_IS10_ES9_IS10_ES9_IS10_EEES7_ILi3ES8_IS9_IS10_ES9_IS10_ES9_IS10_EEEEE13ReshapedArrayI7Float64Li3E8SubArrayIS13_Li2E13CuDeviceArrayIS13_Li3ELi1EES8_I9UnitRangeIS10_ES10_5SliceIS9_IS10_EEELifalseEES8_I27SignedMultiplicativeInverseIS10_EEES12_IS13_Li4ES14_IS13_Li2ES15_IS13_Li3ELi1EES8_IS17_IS9_IS10_EES10_S17_IS9_IS10_EEELifalseEES8_IS18_IS10_EEES12_IS13_Li4ES14_IS13_Li2ES15_IS13_Li3ELi1EES8_IS17_IS9_IS10_EES10_S17_IS9_IS10_EEELifalseEES8_IS18_IS10_EEES9_IS10_ES9_IS10_ES9_IS10_Ev4Bool
                    0.00%  20.734us         5  4.1460us  3.5200us  4.6070us  julia__9_4134(CuKernelContext, CuDeviceArray<Float64, int=1, int=1>, CuDeviceArray<Float64, int=1, int=1>, Int64, CuDeviceArray<Float64, int=1, int=1>)
                    0.00%  18.304us         4  4.5760us  4.1920us  5.0880us  julia_getindex_kernel_7402(CuKernelContext, CuDeviceArray<Int64, int=2, int=1>, CuDeviceArray<Int64, int=5, int=1>, Tuple<Int64, Int64, Int64, Int64, Int64>, Slice<OneTo<Int64>>, Int64, CuDeviceArray<Int64<Tuple<Int64>>, int=5, int=1>, Int64, Int64)
                    0.00%  18.143us         4  4.5350us  4.2240us  4.8320us  julia_getindex_kernel_7669(CuKernelContext, CuDeviceArray<Int64, int=2, int=1>, CuDeviceArray<Int64, int=5, int=1>, Tuple<Int64, Int64, Int64, Int64, Int64>, StepRange<Int64, Int64>, Int64, CuDeviceArray<Int64<Int64, Int64>, int=5, int=1>, Int64, Int64)
                    0.00%  17.120us         4  4.2800us  3.9360us  4.6080us  _Z27julia_broadcast_kernel_752815CuKernelContext8SubArrayI5Int64Li2E13CuDeviceArrayIS1_Li5ELi1EE5TupleIS1_5SliceI5OneToIS1_EES1_S4_IS5_IS1_EES1_ELifalseEE11BroadcastedI12CuArrayStyleILi2EES3_IS5_IS1_ES5_IS1_EE9_identityS3_I8ExtrudedIS2_IS1_Li2ELi1EES3_I4BoolS10_ES3_IS1_S1_EEEES1_
                    0.00%  16.894us         5  3.3780us  3.2950us  3.5200us  julia_setindex_kernel_4355(CuKernelContext, CuDeviceArray<Float64, int=1, int=1>, CuDeviceArray<Float64, int=1, int=1>, Tuple<Int64>, CuDeviceArray<Float64, int=1, int=1>, UnitRange<CuDeviceArray<Float64, int=1, int=1>>)
                    0.00%  16.352us         4  4.0880us  3.9040us  4.2880us  _Z27julia_broadcast_kernel_777815CuKernelContext8SubArrayI5Int64Li2E13CuDeviceArrayIS1_Li5ELi1EE5TupleI5SliceI5OneToIS1_EES1_S4_IS5_IS1_EES1_S1_ELifalseEE11BroadcastedI12CuArrayStyleILi2EES3_IS5_IS1_ES5_IS1_EE9_identityS3_I8ExtrudedIS2_IS1_Li2ELi1EES3_I4BoolS10_ES3_IS1_S1_EEEES1_
                    0.00%  16.032us         1  16.032us  16.032us  16.032us  _Z18julia_gpu______4357ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE17_gpu______435_250I5Int64E16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi4E5TupleI5OneToIS4_ES10_IS4_ES10_IS4_ES10_IS4_EEE7NDRangeILi4ES6_S6_S8_ILi4ES9_IS10_IS4_ES10_IS4_ES10_IS4_ES10_IS4_EEES8_ILi4ES9_IS10_IS4_ES10_IS4_ES10_IS4_ES10_IS4_EEEEE13CuDeviceArrayIS9_IS4_S4_S4_S4_S4_S4_S4_S4_ELi4ELi1EES12_IS9_IS4_S4_S4_S4_ELi3ELi1EES10_IS4_ES10_IS4_ES10_IS4_ES10_IS4_Ev4Bool
                    0.00%  13.440us         3  4.4800us  4.0960us  4.6720us  julia_broadcast_kernel_4011(CuKernelContext, CuDeviceArray<Float64, int=3, int=1>, Broadcasted<CuArrayStyle<int=3>, Tuple<OneTo<Int64>, CuArrayStyle<int=3, Tuple>, CuArrayStyle<int=3, Tuple>>, __, CuArrayStyle<Extruded<CuDeviceArray<Float64, int=3, int=1>, CuArrayStyle<Bool, OneTo<Int64>, OneTo<Int64>>, CuArrayStyle<Tuple, Tuple, Tuple>>, Int64<CuDeviceArray<Float64, int=3, int=1>, CuArrayStyle<OneTo<Int64>, OneTo<Int64>, OneTo<Int64>>, CuArrayStyle<Tuple, Tuple, Tuple>>>>, Tuple)
                    0.00%  10.496us         1  10.496us  10.496us  10.496us  _Z35julia_gpu_materializeboundaryfaces_7ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE30_gpu_materializeboundaryfaces_16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi1E5TupleI5OneToI5Int64EEE7NDRangeILi1ES5_10StaticSizeI6_256__ES7_ILi1ES8_IS9_IS10_EEEvEE11LobattoCellI7Float645ArrayS8_ILi4ELi4ELi4EELi3ES8_I13CuDeviceArrayIS14_Li3ELi1EES15_IS14_Li3ELi1EES15_IS14_Li3ELi1EEES15_I6SArrayIS8_ILi3EES14_Li1ELi3EELi1ELi1EES8_I4KronIS8_I8DiagonalIS14_4OnesIS14_Li1ES8_IS9_IS10_EEEES18_IS14_S19_IS14_Li1ES8_IS9_IS10_EEEES15_IS14_Li2ELi1EEEES17_IS8_IS18_IS14_S19_IS14_Li1ES8_IS9_IS10_EEEES15_IS14_Li2ELi1EES18_IS14_S19_IS14_Li1ES8_IS9_IS10_EEEEEES17_IS8_IS15_IS14_Li2ELi1EES18_IS14_S19_IS14_Li1ES8_IS9_IS10_EEEES18_IS14_S19_IS14_Li1ES8_IS9_IS10_EEEEEEES18_IS14_S15_IS14_Li1ELi1EEES18_IS14_S15_IS14_Li1ELi1EEES17_IS8_IS15_IS14_Li2ELi1EES15_IS14_Li2ELi1EES15_IS14_Li2ELi1EEEES8_IS8_IS15_IS10_Li3ELi1EEES8_IS15_IS10_Li2ELi1EES15_IS10_Li2ELi1EES15_IS10_Li2ELi1EES15_IS10_Li2ELi1EES15_IS10_Li2ELi1EES15_IS10_Li2ELi1EEES8_IS15_IS10_Li1ELi1EES15_IS10_Li1ELi1EES15_IS10_Li1ELi1EES15_IS10_Li1ELi1EES15_IS10_Li1ELi1EES15_IS10_Li1ELi1EES15_IS10_Li1ELi1EES15_IS10_Li1ELi1EES15_IS10_Li1ELi1EES15_IS10_Li1ELi1EES15_IS10_Li1ELi1EES15_IS10_Li1ELi1EEES8_IS10_S10_S10_S10_S10_S10_S10_S10_EEES15_IS10_Li2ELi1EE22GeneralSparseMatrixCSCI11PermutationILi4EES10_S15_IS21_ILi4EELi1ELi1EES15_IS10_Li1ELi1EEE
                    0.00%  10.271us         1  10.271us  10.271us  10.271us  _Z18julia_gpu______4327ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE17_gpu______432_24616CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi4E5TupleI5OneToI5Int64ES9_IS10_ES9_IS10_ES9_IS10_EEE7NDRangeILi4ES5_S5_S7_ILi4ES8_IS9_IS10_ES9_IS10_ES9_IS10_ES9_IS10_EEES7_ILi4ES8_IS9_IS10_ES9_IS10_ES9_IS10_ES9_IS10_EEEEE13CuDeviceArrayI6SArrayIS8_ILi3EE7Float64Li1ELi3EELi4ELi1EES12_IS13_IS8_ILi3EES14_Li1ELi3EELi3ELi1EES12_IS14_Li1ELi1EES9_IS10_ES9_IS10_ES9_IS10_ES9_IS10_Ev4Bool
                    0.00%  9.9530us         2  4.9760us  4.8650us  5.0880us  julia_getindex_kernel_7094(CuKernelContext, CuDeviceArray<Int64, int=3, int=1>, CuDeviceArray<Int64, int=5, int=1>, Tuple<Int64, Int64, Int64, Int64, Int64>, Int64, Slice<OneTo<Int64>>, Int64, CuDeviceArray<Int64<Tuple<Int64>>, int=5, int=1>, CuDeviceArray<Int64, int=1, int=1>)
                    0.00%  8.9270us         1  8.9270us  8.9270us  8.9270us  _Z18julia_gpu______4287ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE17_gpu______428_24116CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi3E5TupleI5OneToI5Int64ES9_IS10_ES9_IS10_EEE7NDRangeILi3ES5_S5_S7_ILi3ES8_IS9_IS10_ES9_IS10_ES9_IS10_EEES7_ILi3ES8_IS9_IS10_ES9_IS10_ES9_IS10_EEEEE13CuDeviceArrayIS8_IS10_S10_S10_S10_ELi3ELi1EES12_IS10_Li5ELi1EES9_IS10_ES9_IS10_ES9_IS10_Ev4Bool
                    0.00%  8.7680us         2  4.3840us  4.0960us  4.6720us  _Z27julia_broadcast_kernel_724815CuKernelContext8SubArrayI5Int64Li3E13CuDeviceArrayIS1_Li5ELi1EE5TupleIS1_5SliceI5OneToIS1_EES1_S4_IS5_IS1_EES2_IS1_Li1ELi1EEELifalseEE11BroadcastedI12CuArrayStyleILi3EES3_IS5_IS1_ES5_IS1_ES5_IS1_EE9_identityS3_I8ExtrudedIS2_IS1_Li3ELi1EES3_I4BoolS10_S10_ES3_IS1_S1_S1_EEEES1_
                    0.00%  8.6720us         1  8.6720us  8.6720us  8.6720us  _Z18julia_gpu______4257ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE17_gpu______425_23716CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi5E5TupleI5OneToI5Int64ES9_IS10_ES9_IS10_ES9_IS10_ES9_IS10_EEE7NDRangeILi5ES5_S5_S7_ILi5ES8_IS9_IS10_ES9_IS10_ES9_IS10_ES9_IS10_ES9_IS10_EEES7_ILi5ES8_IS9_IS10_ES9_IS10_ES9_IS10_ES9_IS10_ES9_IS10_EEEEE13CuDeviceArrayIS10_Li5ELi1EES12_IS10_Li3ELi1EE9UnitRangeIS10_ES13_IS10_ES9_IS10_ES9_IS10_ES9_IS10_Ev4Bool
                    0.00%  7.8080us         1  7.8080us  7.8080us  7.8080us  _Z27julia_broadcast_kernel_252715CuKernelContext13CuDeviceArrayI6SArrayI5TupleILi3EE7Float64Li1ELi3EELi3ELi1EE11BroadcastedI12CuArrayStyleILi3EES2_I5OneToI5Int64ES6_IS7_ES6_IS7_EE8_328_329I26SVector_S__T__where__S__T_ES2_I8ExtrudedIS0_IS3_Li3ELi1EES2_I4BoolS10_S10_ES2_IS7_S7_S7_EES9_IS0_IS3_Li3ELi1EES2_IS10_S10_S10_ES2_IS7_S7_S7_EES9_IS0_IS3_Li3ELi1EES2_IS10_S10_S10_ES2_IS7_S7_S7_EEEES7_
                    0.00%  7.2960us         1  7.2960us  7.2960us  7.2960us  _Z34julia_partial_mapreduce_grid_211139_identity4_max7Float6416CartesianIndicesILi3E5TupleI5OneToI5Int64ES4_IS5_ES4_IS5_EEES2_ILi3ES3_IS4_IS5_ES4_IS5_ES4_IS5_EEE3ValILitrueEE13CuDeviceArrayIS1_Li4ELi1EES7_IS1_Li3ELi1EE
                    0.00%  5.3430us         1  5.3430us  5.3430us  5.3430us  julia_broadcast_kernel_3872(CuKernelContext, CuDeviceArray<Float64, int=3, int=1>, Broadcasted<CuArrayStyle<int=3>, Tuple<OneTo<Int64>, CuArrayStyle<int=3, Tuple>, CuArrayStyle<int=3, Tuple>>, __, CuArrayStyle<Extruded<CuDeviceArray<Float64, int=3, int=1>, CuArrayStyle<Bool, OneTo<Int64>, OneTo<Int64>>, CuArrayStyle<Tuple, Tuple, Tuple>>, Int64<CuDeviceArray<Float64, int=3, int=1>, CuArrayStyle<OneTo<Int64>, OneTo<Int64>, OneTo<Int64>>, CuArrayStyle<Tuple, Tuple, Tuple>>, Int64<CuDeviceArray<Float64, int=3, int=1>, CuArrayStyle<OneTo<Int64>, OneTo<Int64>, OneTo<Int64>>, CuArrayStyle<Tuple, Tuple, Tuple>>>>, Tuple)
                    0.00%  4.6720us         1  4.6720us  4.6720us  4.6720us  _Z18julia_gpu______4017ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE17_gpu______401_213I5Int64E16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi2E5TupleI5OneToIS4_ES10_IS4_EEE7NDRangeILi2ES6_S6_S8_ILi2ES9_IS10_IS4_ES10_IS4_EEES8_ILi2ES9_IS10_IS4_ES10_IS4_EEEEE13CuDeviceArrayI6SArrayIS9_ILi3EE7Float64Li1ELi3EELi3ELi1EES12_IS14_Li1ELi1EES10_IS4_ES10_IS4_Ev4Bool
                    0.00%  4.6400us         1  4.6400us  4.6400us  4.6400us  _Z18julia_gpu______4177ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE17_gpu______417_229I5Int64E16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi2E5TupleI5OneToIS4_ES10_IS4_EEE7NDRangeILi2ES6_S6_S8_ILi2ES9_IS10_IS4_ES10_IS4_EEES8_ILi2ES9_IS10_IS4_ES10_IS4_EEEEE13CuDeviceArrayI6SArrayIS9_ILi3EE7Float64Li1ELi3EELi3ELi1EES12_IS14_Li1ELi1EES10_IS4_ES10_IS4_Ev4Bool
                    0.00%  4.6080us         1  4.6080us  4.6080us  4.6080us  _Z18julia_gpu______4137ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE17_gpu______413_225I5Int64E16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi2E5TupleI5OneToIS4_ES10_IS4_EEE7NDRangeILi2ES6_S6_S8_ILi2ES9_IS10_IS4_ES10_IS4_EEES8_ILi2ES9_IS10_IS4_ES10_IS4_EEEEE13CuDeviceArrayI6SArrayIS9_ILi3EE7Float64Li1ELi3EELi3ELi1EES12_IS14_Li1ELi1EES10_IS4_ES10_IS4_Ev4Bool
                    0.00%  4.6080us         1  4.6080us  4.6080us  4.6080us  _Z18julia_gpu______4217ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE17_gpu______421_233I5Int64E16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi2E5TupleI5OneToIS4_ES10_IS4_EEE7NDRangeILi2ES6_S6_S8_ILi2ES9_IS10_IS4_ES10_IS4_EEES8_ILi2ES9_IS10_IS4_ES10_IS4_EEEEE13CuDeviceArrayI6SArrayIS9_ILi3EE7Float64Li1ELi3EELi3ELi1EES12_IS14_Li1ELi1EES10_IS4_ES10_IS4_Ev4Bool
                    0.00%  4.5760us         1  4.5760us  4.5760us  4.5760us  _Z18julia_gpu______4097ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE17_gpu______409_221I5Int64E16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi2E5TupleI5OneToIS4_ES10_IS4_EEE7NDRangeILi2ES6_S6_S8_ILi2ES9_IS10_IS4_ES10_IS4_EEES8_ILi2ES9_IS10_IS4_ES10_IS4_EEEEE13CuDeviceArrayI6SArrayIS9_ILi3EE7Float64Li1ELi3EELi3ELi1EES12_IS14_Li1ELi1EES10_IS4_ES10_IS4_Ev4Bool
                    0.00%  4.5450us         1  4.5450us  4.5450us  4.5450us  _Z18julia_gpu______4057ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE17_gpu______405_217I5Int64E16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi2E5TupleI5OneToIS4_ES10_IS4_EEE7NDRangeILi2ES6_S6_S8_ILi2ES9_IS10_IS4_ES10_IS4_EEES8_ILi2ES9_IS10_IS4_ES10_IS4_EEEEE13CuDeviceArrayI6SArrayIS9_ILi3EE7Float64Li1ELi3EELi3ELi1EES12_IS14_Li1ELi1EES10_IS4_ES10_IS4_Ev4Bool
                    0.00%  4.3840us         1  4.3840us  4.3840us  4.3840us  _Z27julia_broadcast_kernel_507715CuKernelContext13CuDeviceArrayI6SArrayI5TupleILi2EE7Float64Li1ELi2EELi2ELi1EE11BroadcastedI12CuArrayStyleILi2EES2_I5OneToI5Int64ES6_IS7_EE8_328_329I26SVector_S__T__where__S__T_ES2_I8ExtrudedIS0_IS3_Li2ELi1EES2_I4BoolS10_ES2_IS7_S7_EES9_IS0_IS3_Li2ELi1EES2_IS10_S10_ES2_IS7_S7_EEEES7_
                    0.00%  4.2240us         1  4.2240us  4.2240us  4.2240us  julia_broadcast_kernel_5349(CuKernelContext, CuDeviceArray<Float64, int=2, int=1>, Broadcasted<CuArrayStyle<int=2>, Tuple<OneTo<Int64>, CuArrayStyle<int=2, Tuple>>, __, CuArrayStyle<Extruded<CuDeviceArray<Float64, int=2, int=1>, CuArrayStyle<Bool, OneTo<Int64>>, CuArrayStyle<Tuple, Tuple>>, Int64<CuDeviceArray<Float64, int=2, int=1>, CuArrayStyle<OneTo<Int64>, OneTo<Int64>>, CuArrayStyle<Tuple, Tuple>>>>, Tuple)
      API calls:   28.64%  365.95ms     36006  10.163us  4.4300us  2.8136ms  cuLaunchKernel
                   18.60%  237.72ms         1  237.72ms  237.72ms  237.72ms  cuDevicePrimaryCtxRetain
                   17.41%  222.44ms      1526  145.77us     840ns  21.865ms  cuMemAllocAsync
                    5.81%  74.235ms     10516  7.0590us  4.0600us  8.6593ms  cuLaunchHostFunc
                    5.54%  70.753ms     44268  1.5980us     360ns  85.769us  cuStreamQuery
                    5.10%  65.178ms    298761     218ns     140ns  94.330us  cuCtxGetCurrent
                    3.65%  46.693ms     36302  1.2860us     290ns  22.005ms  cuEventDestroy
                    3.01%  38.412ms     36302  1.0580us     629ns  162.36us  cuEventRecord
                    2.84%  36.241ms     37739     960ns     580ns  100.15us  cuStreamWaitEvent
                    2.28%  29.178ms     36302     803ns     320ns  1.9178ms  cuEventCreate
                    1.82%  23.251ms       101  230.20us  105.42us  3.2107ms  cuModuleLoadDataEx
                    1.63%  20.787ms        21  989.84us  16.560us  2.2136ms  cuMemcpyDtoHAsync
                    1.21%  15.417ms      8893  1.7330us     320ns  11.810us  cuOccupancyMaxPotentialBlockSize
                    0.85%  10.862ms     10517  1.0320us     610ns  70.360us  cuEventQuery
                    0.43%  5.4933ms        53  103.65us  4.9600us  862.30us  cuMemcpyHtoDAsync
                    0.32%  4.0261ms       101  39.862us  12.999us  465.43us  cuModuleUnload
                    0.26%  3.3651ms      1526  2.2050us     920ns  590.63us  cuMemFreeAsync
                    0.22%  2.8380ms       101  28.099us  12.750us  982.27us  cuMemHostAlloc
                    0.16%  1.9830ms       202  9.8160us  1.1100us  26.940us  cuCtxSynchronize
                    0.08%  979.78us         2  489.89us     300ns  979.48us  cuDeviceGetMemPool
                    0.07%  930.21us        18  51.678us  2.4400us  623.73us  cuStreamCreate
                    0.02%  308.06us      1528     201ns     160ns  7.1700us  cuCtxGetDevice
                    0.01%  174.52us        74  2.3580us     290ns  9.3000us  cuPointerGetAttribute
                    0.01%  160.53us       101  1.5890us     710ns  4.2300us  cuModuleGetFunction
                    0.01%  143.19us        18  7.9540us  3.1400us  28.819us  cuStreamDestroy
                    0.01%  121.95us       101  1.2070us     720ns  5.5000us  cuMemHostGetDevicePointer
                    0.01%  114.64us        21  5.4590us  2.3700us  11.420us  cuStreamSynchronize
                    0.01%  87.879us       374     234ns     140ns  2.3200us  cuDeviceGetAttribute
                    0.00%  14.040us        13  1.0800us     250ns  2.0400us  cuDeviceGetCount
                    0.00%  10.250us         5  2.0500us     520ns  3.2500us  cuCtxSetCurrent
                    0.00%  6.9400us        11     630ns     140ns  1.8200us  cuDriverGetVersion
                    0.00%  6.8900us         1  6.8900us  6.8900us  6.8900us  cuDeviceGetPCIBusId
                    0.00%  3.6500us         1  3.6500us  3.6500us  3.6500us  cuMemPoolSetAttribute
                    0.00%  1.4700us         3     490ns     190ns     700ns  cuDeviceGet

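Running the same script again with Julia's default bounds checking (that is, without the `--check-bounds=no` flag used for the profile above)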

nvprof julia-1.6 --project experiment/euler_gravity/held_suarez_deep.jl

gives

==3383== Profiling application: julia-1.6 --project experiment/euler_gravity/held_suarez_deep.jl
==3383== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   27.33%  15.1559s      1512  10.024ms  9.9835ms  10.196ms  _Z33julia_gpu_banded_backward_kernel_7ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE28_gpu_banded_backward_kernel_16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi1E5TupleI5OneToI5Int64EEE7NDRangeILi1ES5_10StaticSizeI5_16__ES7_ILi1ES8_IS9_IS10_EEEvEE13ReshapedArrayI7Float64Li3E17PermutedDimsArrayIS14_Li5E15_1__3__2__4__5_15_1__3__2__4__5_13CuDeviceArrayIS14_Li5ELi1EEES8_I27SignedMultiplicativeInverseIS10_ES17_IS10_ES17_IS10_ES17_IS10_EEE15BatchedBandedLUILi16ELi300ELi20ELi20ELi1350ES14_S16_IS14_Li4ELi1EEES13_IS14_Li3ES15_IS14_Li5E15_1__3__2__4__5_15_1__3__2__4__5_S16_IS14_Li5ELi1EEES8_IS17_IS10_ES17_IS10_ES17_IS10_ES17_IS10_EEE
                   26.14%  14.4974s      1512  9.5882ms  9.5563ms  9.6714ms  _Z32julia_gpu_banded_forward_kernel_7ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE27_gpu_banded_forward_kernel_16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi1E5TupleI5OneToI5Int64EEE7NDRangeILi1ES5_10StaticSizeI5_16__ES7_ILi1ES8_IS9_IS10_EEEvEE13ReshapedArrayI7Float64Li3E17PermutedDimsArrayIS14_Li5E15_1__3__2__4__5_15_1__3__2__4__5_13CuDeviceArrayIS14_Li5ELi1EEES8_I27SignedMultiplicativeInverseIS10_ES17_IS10_ES17_IS10_ES17_IS10_EEE15BatchedBandedLUILi16ELi300ELi20ELi20ELi1350ES14_S16_IS14_Li4ELi1EEES13_IS14_Li3ES15_IS14_Li5E15_1__3__2__4__5_15_1__3__2__4__5_S16_IS14_Li5ELi1EEES8_IS17_IS10_ES17_IS10_ES17_IS10_ES17_IS10_EEE
                   11.08%  6.14791s        36  170.78ms  168.42ms  180.72ms  _Z26julia_gpu_bandedlu_kernel_7ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE21_gpu_bandedlu_kernel_16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi1E5TupleI5OneToI5Int64EEE7NDRangeILi1ES5_10StaticSizeI5_16__ES7_ILi1ES8_IS9_IS10_EEEvEE15BatchedBandedLUILi16ELi300ELi20ELi20ELi1350E7Float6413CuDeviceArrayIS14_Li4ELi1EEE
                    9.32%  5.17069s      3744  1.3811ms  1.1046ms  2.2809ms  _Z26julia_gpu_volume_term_dir_7ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE21_gpu_volume_term_dir_16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi3E5TupleI5OneToI5Int64ES9_IS10_ES9_IS10_EEE7NDRangeILi3ES5_10StaticSizeI9_4__4__4_ES7_ILi3ES8_IS9_IS10_ES9_IS10_ES9_IS10_EEEvEE19EulerTotalEnergyLawI7Float64Li3ELi5E49_____1_4__grav___9_81__pde_level_balance___false_E11StructArrayI6SArrayIS8_ILi5EES14_Li1ELi5EELi2ES8_I8SubArrayIS14_Li2E13CuDeviceArrayIS14_Li3ELi1EES8_I5SliceIS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEEE14CartesianIndexILi2EEES15_IS16_IS8_ILi5EES14_Li1ELi5EELi2ES8_IS17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEEES20_ILi2EEES18_IS14_Li2ELi1EE27LinearizedKennedyGruberFluxS15_I10NamedTupleI8__g___J_S8_IS16_IS8_ILi3ELi3EES14_Li2ELi9EES14_EELi2ES22_I8__g___J_S8_IS15_IS16_IS8_ILi3ELi3EES14_Li2ELi9EELi2ES8_IS17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES
18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEEES20_ILi2EEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEEEES20_ILi2EEES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES15_IS16_IS8_ILi9EES14_Li1ELi9EELi2ES8_IS18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EEES10_E4Bool3ValILi3EES24_ILi3EES24_ILi4EES24_ILi4EES24_ILi4EES24_ILi5EES24_ILi9EES24_ILifalseEE
                    3.99%  2.21097s      2268  974.85us  947.29us  1.1841ms  _Z26julia_gpu_volume_term_dir_7ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE21_gpu_volume_term_dir_16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi3E5TupleI5OneToI5Int64ES9_IS10_ES9_IS10_EEE7NDRangeILi3ES5_10StaticSizeI9_4__4__4_ES7_ILi3ES8_IS9_IS10_ES9_IS10_ES9_IS10_EEEvEE19EulerTotalEnergyLawI7Float64Li3ELi5E49_____1_4__grav___9_81__pde_level_balance___false_E11StructArrayI6SArrayIS8_ILi5EES14_Li1ELi5EELi2ES8_I8SubArrayIS14_Li2E13CuDeviceArrayIS14_Li3ELi1EES8_I5SliceIS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEEE14CartesianIndexILi2EEES15_IS16_IS8_ILi5EES14_Li1ELi5EELi2ES8_IS17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEEES20_ILi2EEES18_IS14_Li2ELi1EE17KennedyGruberFluxS15_I10NamedTupleI8__g___J_S8_IS16_IS8_ILi3ELi3EES14_Li2ELi9EES14_EELi2ES22_I8__g___J_S8_IS15_IS16_IS8_ILi3ELi3EES14_Li2ELi9EELi2ES8_IS17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li
3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEEES20_ILi2EEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEEEES20_ILi2EEES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES15_IS16_IS8_ILi9EES14_Li1ELi9EELi2ES8_IS18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EEES10_E4Bool3ValILi1EES24_ILi3EES24_ILi4EES24_ILi4EES24_ILi4EES24_ILi5EES24_ILi9EES24_ILifalseEE
                    3.72%  2.06478s      4536  455.20us  442.65us  469.24us  _Z28julia_broadcast_kernel_3018415CuKernelContext11StructArrayI6SArrayI5TupleILi5EE7Float64Li1ELi5EELi2ES2_I8SubArrayIS3_Li2E13CuDeviceArrayIS3_Li3ELi1EES2_I5SliceI5OneToI5Int64EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEEE14CartesianIndexILi2EEE11BroadcastedIvS2_IS7_IS8_ES7_IS8_EE2__S2_I8ExtrudedIS0_IS1_IS2_ILi5EES3_Li1ELi5EELi2ES2_IS4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEEES9_ILi2EEES2_I4BoolS13_ES2_IS8_S8_EES10_I16StructArrayStyleI12CuArrayStyleILi2EEEvS11_S2_IS3_S10_IS14_IS15_ILi2EEEvS11_S2_IS10_IS14_IS15_ILi2EEEvS11_S2_IS3_S12_IS0_IS1_IS2_ILi5EES3_Li1ELi5EELi2ES2_IS4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEEES9_ILi2EEES2_IS13_S13_ES2_IS8_S8_EEEES10_IS14_IS15_ILi2EEEvS11_S2_IS3_S12_IS0_IS1_IS2_ILi5EES3_Li1ELi5EELi2ES2_IS4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS
6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEEES9_ILi2EEES2_IS13_S13_ES2_IS8_S8_EEEEEEEEEES8_
                    2.59%  1.43632s      2268  633.30us  619.10us  796.70us  _Z26julia_gpu_volume_term_dir_7ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE21_gpu_volume_term_dir_16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi3E5TupleI5OneToI5Int64ES9_IS10_ES9_IS10_EEE7NDRangeILi3ES5_10StaticSizeI9_4__4__4_ES7_ILi3ES8_IS9_IS10_ES9_IS10_ES9_IS10_EEEvEE19EulerTotalEnergyLawI7Float64Li3ELi5E49_____1_4__grav___9_81__pde_level_balance___false_E11StructArrayI6SArrayIS8_ILi5EES14_Li1ELi5EELi2ES8_I8SubArrayIS14_Li2E13CuDeviceArrayIS14_Li3ELi1EES8_I5SliceIS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEEE14CartesianIndexILi2EEES15_IS16_IS8_ILi5EES14_Li1ELi5EELi2ES8_IS17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEEES20_ILi2EEES18_IS14_Li2ELi1EE17KennedyGruberFluxS15_I10NamedTupleI8__g___J_S8_IS16_IS8_ILi3ELi3EES14_Li2ELi9EES14_EELi2ES22_I8__g___J_S8_IS15_IS16_IS8_ILi3ELi3EES14_Li2ELi9EELi2ES8_IS17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li
3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEEES20_ILi2EEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEEEES20_ILi2EEES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES15_IS16_IS8_ILi9EES14_Li1ELi9EELi2ES8_IS18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EEES10_E4Bool3ValILi2EES24_ILi3EES24_ILi4EES24_ILi4EES24_ILi4EES24_ILi5EES24_ILi9EES24_ILitrueEE
                    2.56%  1.42024s      2268  626.21us  610.55us  796.44us  _Z26julia_gpu_volume_term_dir_7ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE21_gpu_volume_term_dir_16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi3E5TupleI5OneToI5Int64ES9_IS10_ES9_IS10_EEE7NDRangeILi3ES5_10StaticSizeI9_4__4__4_ES7_ILi3ES8_IS9_IS10_ES9_IS10_ES9_IS10_EEEvEE19EulerTotalEnergyLawI7Float64Li3ELi5E49_____1_4__grav___9_81__pde_level_balance___false_E11StructArrayI6SArrayIS8_ILi5EES14_Li1ELi5EELi2ES8_I8SubArrayIS14_Li2E13CuDeviceArrayIS14_Li3ELi1EES8_I5SliceIS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEEE14CartesianIndexILi2EEES15_IS16_IS8_ILi5EES14_Li1ELi5EELi2ES8_IS17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEEES20_ILi2EEES18_IS14_Li2ELi1EE17KennedyGruberFluxS15_I10NamedTupleI8__g___J_S8_IS16_IS8_ILi3ELi3EES14_Li2ELi9EES14_EELi2ES22_I8__g___J_S8_IS15_IS16_IS8_ILi3ELi3EES14_Li2ELi9EELi2ES8_IS17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li
3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEEES20_ILi2EEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEEEES20_ILi2EEES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES15_IS16_IS8_ILi9EES14_Li1ELi9EELi2ES8_IS18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EEES10_E4Bool3ValILi3EES24_ILi3EES24_ILi4EES24_ILi4EES24_ILi4EES24_ILi5EES24_ILi9EES24_ILitrueEE
                    2.50%  1.38931s      1476  941.26us  932.50us  950.14us  _Z34julia_gpu_banded_setvector_kernel_7ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE29_gpu_banded_setvector_kernel_16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi2E5TupleI5OneToI5Int64ES9_IS10_EEE7NDRangeILi2ES5_10StaticSizeI8_16__16_ES7_ILi2ES8_IS9_IS10_ES9_IS10_EEEvEE13ReshapedArrayI7Float64Li3E17PermutedDimsArrayIS14_Li5E15_1__3__2__4__5_15_1__3__2__4__5_13CuDeviceArrayIS14_Li5ELi1EEES8_I27SignedMultiplicativeInverseIS10_ES17_IS10_ES17_IS10_ES17_IS10_EEES13_IS14_Li3ES15_IS14_Li5E15_1__3__2__4__5_15_1__3__2__4__5_S16_IS14_Li5ELi1EEES8_IS17_IS10_ES17_IS10_ES17_IS10_ES17_IS10_EEES10_3ValILi41EES18_ILi16EES18_ILi300EE
                    2.06%  1.14099s      3744  304.75us  272.61us  399.07us  _Z27julia_gpu_surface_term_dir_7ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE22_gpu_surface_term_dir_16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi2E5TupleI5OneToI5Int64ES9_IS10_EEE7NDRangeILi2ES5_10StaticSizeI7_16__2_ES7_ILi2ES8_IS9_IS10_ES9_IS10_EEEvEE19EulerTotalEnergyLawI7Float64Li3ELi5E49_____1_4__grav___9_81__pde_level_balance___false_E11StructArrayI6SArrayIS8_ILi5EES14_Li1ELi5EELi2ES8_I8SubArrayIS14_Li2E13CuDeviceArrayIS14_Li3ELi1EES8_I5SliceIS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEEE14CartesianIndexILi2EEES15_IS16_IS8_ILi5EES14_Li1ELi5EELi2ES8_IS17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEEES20_ILi2EEE3ValI27_0__16__32__48__64__80__96_ES21_ILi3EE21LinearizedRefanovFluxIS14_ES18_IS14_Li2ELi1EES18_IS10_Li2ELi1EES18_IS10_Li2ELi1EES18_IS14_Li2ELi1EES15_IS16_IS8_ILi3EES14_Li1ELi3EELi2ES8_IS17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEEES20_ILi2EEES18_IS10_Li2ELi1EES15_IS16_IS8_ILi9EES14_Li1ELi9EELi2ES8_IS18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS1
4_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EEES10_ES21_ILi3EE
                    2.00%  1.11186s      2268  490.24us  483.07us  500.12us  _Z28julia_broadcast_kernel_3000615CuKernelContext11StructArrayI6SArrayI5TupleILi5EE7Float64Li1ELi5EELi2ES2_I8SubArrayIS3_Li2E13CuDeviceArrayIS3_Li3ELi1EES2_I5SliceI5OneToI5Int64EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEEE14CartesianIndexILi2EEE11BroadcastedIvS2_IS7_IS8_ES7_IS8_EE2__S2_I8ExtrudedIS0_IS1_IS2_ILi5EES3_Li1ELi5EELi2ES2_IS4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEEES9_ILi2EEES2_I4BoolS13_ES2_IS8_S8_EES12_IS0_IS1_IS2_ILi5EES3_Li1ELi5EELi2ES2_IS4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEEES9_ILi2EEES2_IS13_S13_ES2_IS8_S8_EEEES8_
                    1.33%  737.06ms      1553  474.61us  469.92us  493.95us  _Z28julia_broadcast_kernel_1997715CuKernelContext11StructArrayI6SArrayI5TupleILi5EE7Float64Li1ELi5EELi2ES2_I8SubArrayIS3_Li2E13CuDeviceArrayIS3_Li3ELi1EES2_I5SliceI5OneToI5Int64EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEEE14CartesianIndexILi2EEE11BroadcastedIvS2_IS7_IS8_ES7_IS8_EE9_identityS2_I8ExtrudedIS0_IS1_IS2_ILi5EES3_Li1ELi5EELi2ES2_IS4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEEES9_ILi2EEES2_I4BoolS13_ES2_IS8_S8_EEEES8_
                    1.31%  727.87ms      2268  320.93us  315.55us  328.13us  _Z27julia_gpu_surface_term_dir_7ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE22_gpu_surface_term_dir_16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi2E5TupleI5OneToI5Int64ES9_IS10_EEE7NDRangeILi2ES5_10StaticSizeI7_16__2_ES7_ILi2ES8_IS9_IS10_ES9_IS10_EEEvEE19EulerTotalEnergyLawI7Float64Li3ELi5E49_____1_4__grav___9_81__pde_level_balance___false_E11StructArrayI6SArrayIS8_ILi5EES14_Li1ELi5EELi2ES8_I8SubArrayIS14_Li2E13CuDeviceArrayIS14_Li3ELi1EES8_I5SliceIS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEEE14CartesianIndexILi2EEES15_IS16_IS8_ILi5EES14_Li1ELi5EELi2ES8_IS17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEEES20_ILi2EEE3ValI27_0__16__32__48__64__80__96_ES21_ILi1EE7RoeFluxS18_IS14_Li2ELi1EES18_IS10_Li2ELi1EES18_IS10_Li2ELi1EES18_IS14_Li2ELi1EES15_IS16_IS8_ILi3EES14_Li1ELi3EELi2ES8_IS17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEEES20_ILi2EEES18_IS10_Li2ELi1EES15_IS16_IS8_ILi9EES14_Li1ELi9EELi2ES8_IS18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_L
i2ELi1EES18_IS14_Li2ELi1EEES10_ES21_ILi3EE
                    1.24%  685.92ms      2268  302.43us  296.35us  308.22us  _Z27julia_gpu_surface_term_dir_7ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE22_gpu_surface_term_dir_16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi2E5TupleI5OneToI5Int64ES9_IS10_EEE7NDRangeILi2ES5_10StaticSizeI7_16__2_ES7_ILi2ES8_IS9_IS10_ES9_IS10_EEEvEE19EulerTotalEnergyLawI7Float64Li3ELi5E49_____1_4__grav___9_81__pde_level_balance___false_E11StructArrayI6SArrayIS8_ILi5EES14_Li1ELi5EELi2ES8_I8SubArrayIS14_Li2E13CuDeviceArrayIS14_Li3ELi1EES8_I5SliceIS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEEE14CartesianIndexILi2EEES15_IS16_IS8_ILi5EES14_Li1ELi5EELi2ES8_IS17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEEES20_ILi2EEE3ValI27_0__16__32__48__64__80__96_ES21_ILi2EE7RoeFluxS18_IS14_Li2ELi1EES18_IS10_Li2ELi1EES18_IS10_Li2ELi1EES18_IS14_Li2ELi1EES15_IS16_IS8_ILi3EES14_Li1ELi3EELi2ES8_IS17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEEES20_ILi2EEES18_IS10_Li2ELi1EES15_IS16_IS8_ILi9EES14_Li1ELi9EELi2ES8_IS18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_L
i2ELi1EES18_IS14_Li2ELi1EEES10_ES21_ILi3EE
                    1.17%  648.54ms      1476  439.39us  425.37us  465.47us  _Z34julia_gpu_banded_setmatrix_kernel_7ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE29_gpu_banded_setmatrix_kernel_16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi2E5TupleI5OneToI5Int64ES9_IS10_EEE7NDRangeILi2ES5_10StaticSizeI8_16__16_ES7_ILi2ES8_IS9_IS10_ES9_IS10_EEEvEE19BatchedBandedMatrixILi16ELi300ELi20ELi20ELi1350E7Float6413CuDeviceArrayIS14_Li4ELi1EEE13ReshapedArrayIS14_Li3E17PermutedDimsArrayIS14_Li5E15_1__3__2__4__5_15_1__3__2__4__5_S15_IS14_Li5ELi1EEES8_I27SignedMultiplicativeInverseIS10_ES18_IS10_ES18_IS10_ES18_IS10_EEES10_3ValILi16EE
                    0.95%  529.27ms      2268  233.36us  226.46us  250.46us  _Z27julia_gpu_surface_term_dir_7ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE22_gpu_surface_term_dir_16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi2E5TupleI5OneToI5Int64ES9_IS10_EEE7NDRangeILi2ES5_10StaticSizeI7_16__2_ES7_ILi2ES8_IS9_IS10_ES9_IS10_EEEvEE19EulerTotalEnergyLawI7Float64Li3ELi5E49_____1_4__grav___9_81__pde_level_balance___false_E11StructArrayI6SArrayIS8_ILi5EES14_Li1ELi5EELi2ES8_I8SubArrayIS14_Li2E13CuDeviceArrayIS14_Li3ELi1EES8_I5SliceIS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEEE14CartesianIndexILi2EEES15_IS16_IS8_ILi5EES14_Li1ELi5EELi2ES8_IS17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEEES20_ILi2EEE3ValI27_0__16__32__48__64__80__96_ES21_ILi3EE11RefanovFluxIS14_ES18_IS14_Li2ELi1EES18_IS10_Li2ELi1EES18_IS10_Li2ELi1EES18_IS14_Li2ELi1EES15_IS16_IS8_ILi3EES14_Li1ELi3EELi2ES8_IS17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEES17_IS14_Li2ES18_IS14_Li3ELi1EES8_IS19_IS9_IS10_EES10_S19_IS9_IS10_EEELifalseEEES20_ILi2EEES18_IS10_Li2ELi1EES15_IS16_IS8_ILi9EES14_Li1ELi9EELi2ES8_IS18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EES18_IS14_Li2ELi1E
ES18_IS14_Li2ELi1EES18_IS14_Li2ELi1EEES10_ES21_ILi3EE
                    0.38%  212.38ms        36  5.8995ms  5.8838ms  5.9240ms  julia_broadcast_kernel_26299(CuKernelContext, CuDeviceArray<Float64, int=4, int=1>, Broadcasted<CuArrayStyle<int=4>, Tuple<OneTo<Int64>, CuArrayStyle<int=4, Tuple>, CuArrayStyle<int=4, Tuple>, CuArrayStyle<int=4, Tuple>>, __, CuArrayStyle<Extruded<CuDeviceArray<Float64, int=4, int=1>, CuArrayStyle<Bool, OneTo<Int64>, OneTo<Int64>, OneTo<Int64>>, CuArrayStyle<Tuple, Tuple, Tuple, Tuple>>, Float64>>, Tuple)
                    0.08%  46.322ms        38  1.2190ms  1.2039ms  1.2264ms  _Z28julia_broadcast_kernel_2027215CuKernelContext11StructArrayI6SArrayI5TupleILi9EE7Float64Li1ELi9EELi2ES2_I13CuDeviceArrayIS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EEE5Int64E11BroadcastedIvS2_I5OneToIS5_ES7_IS5_EE17_sphere_auxiliaryS2_I10CuRefValueI19EulerTotalEnergyLawIS3_Li3ELi5E49_____1_4__grav___9_81__pde_level_balance___false_EES9_I10NamedTupleI72__a_______g___R_d_______p____cp_d___cv_d___H___T_________gravc___mearth_S2_IS3_S3_S3_S3_S3_S3_S3_S3_S3_S5_S3_S3_S3_EEE8ExtrudedIS0_IS1_IS2_ILi3EES3_Li1ELi3EELi2ES2_IS4_IS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EEES5_ES2_I4BoolS13_ES2_IS5_S5_EES12_IS0_IS1_IS2_ILi5EES3_Li1ELi5EELi2ES2_I8SubArrayIS3_Li2ES4_IS3_Li3ELi1EES2_I5SliceIS7_IS5_EES5_S15_IS7_IS5_EEELifalseEES14_IS3_Li2ES4_IS3_Li3ELi1EES2_IS15_IS7_IS5_EES5_S15_IS7_IS5_EEELifalseEES14_IS3_Li2ES4_IS3_Li3ELi1EES2_IS15_IS7_IS5_EES5_S15_IS7_IS5_EEELifalseEES14_IS3_Li2ES4_IS3_Li3ELi1EES2_IS15_IS7_IS5_EES5_S15_IS7_IS5_EEELifalseEES14_IS3_Li2ES4_IS3_Li3ELi1EES2_IS15_IS7_IS5_EES5_S15_IS7_IS5_EEELifalseEEE14CartesianIndexILi2EEES2_IS13_S13_ES2_IS5_S5_EEEES5_
                    0.08%  43.736ms        72  607.45us  596.99us  615.42us  julia_broadcast_kernel_23792(CuKernelContext, StructArray<SArray<Tuple<int=9>, Float64, int=1, int=9>, int=2, Tuple<CuDeviceArray<Tuple<int=9>, int=2, int=1>, Float64<Tuple<int=9>, int=2, int=1>, Float64<Tuple<int=9>, int=2, int=1>, Float64<Tuple<int=9>, int=2, int=1>, Float64<Tuple<int=9>, int=2, int=1>, Float64<Tuple<int=9>, int=2, int=1>, Float64<Tuple<int=9>, int=2, int=1>, Float64<Tuple<int=9>, int=2, int=1>, Float64<Tuple<int=9>, int=2, int=1>>, Int64>, Broadcasted<void, Tuple<OneTo<SArray<Tuple<int=9>, Float64, int=1, int=9>>, CuDeviceArray<Tuple<int=9, SArray<Tuple<int=9>, Float64, int=1, int=9>>, int=2, int=1>>, _identity, Tuple<Extruded<StructArray<SArray<Tuple<int=9>, Tuple<int=9>, int=1, int=9>, int=2, Tuple<Float64<Tuple<int=9>, int=2, int=1>, Float64<Tuple<int=9>, int=2, int=1>, Float64<Tuple<int=9>, int=2, int=1>, Float64<Tuple<int=9>, int=2, int=1>, Float64<Tuple<int=9>, int=2, int=1>, Float64<Tuple<int=9>, int=2, int=1>, Float64<Tuple<int=9>, int=2, int=1>, Float64<Tuple<int=9>, int=2, int=1>, Float64<Tuple<int=9>, int=2, int=1>>, SArray<Tuple<int=9>, Float64, int=1, int=9>>, Tuple<Bool, Tuple<Float64<Tuple<int=9>, int=2, int=1>, Float64<Tuple<int=9>, int=2, int=1>, Float64<Tuple<int=9>, int=2, int=1>, Float64<Tuple<int=9>, int=2, int=1>, Float64<Tuple<int=9>, int=2, int=1>, Float64<Tuple<int=9>, int=2, int=1>, Float64<Tuple<int=9>, int=2, int=1>, Float64<Tuple<int=9>, int=2, int=1>, Float64<Tuple<int=9>, int=2, int=1>>>, Tuple<SArray<Tuple<int=9>, Float64, int=1, int=9>, SArray<Tuple<int=9>, Float64, int=1, int=9>>>>>, SArray<Tuple<int=9>, Float64, int=1, int=9>)
                    0.04%  19.828ms        36  550.77us  544.70us  557.05us  _Z28julia_broadcast_kernel_3103315CuKernelContext11StructArrayI6SArrayI5TupleILi5EE7Float64Li1ELi5EELi2ES2_I8SubArrayIS3_Li2E13CuDeviceArrayIS3_Li3ELi1EES2_I5SliceI5OneToI5Int64EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEEE14CartesianIndexILi2EEE11BroadcastedIvS2_IS7_IS8_ES7_IS8_EE9_identityS2_I8ExtrudedIS0_IS1_IS2_ILi5EES3_Li1ELi5EELi2ES2_IS5_IS3_Li2ELi1EES5_IS3_Li2ELi1EES5_IS3_Li2ELi1EES5_IS3_Li2ELi1EES5_IS3_Li2ELi1EEES8_ES2_I4BoolS13_ES2_IS8_S8_EEEES8_
                    0.02%  10.001ms         1  10.001ms  10.001ms  10.001ms  _Z33julia_gpu_materializefaceindices_7ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE28_gpu_materializefaceindices_16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi2E5TupleI5OneToI5Int64ES9_IS10_EEE7NDRangeILi2ES5_10StaticSizeI7_16__5_ES7_ILi2ES8_IS9_IS10_ES9_IS10_EEEvEE11LobattoCellI7Float645ArrayS8_ILi4ELi4ELi4EELi3ES8_I13CuDeviceArrayIS14_Li3ELi1EES15_IS14_Li3ELi1EES15_IS14_Li3ELi1EEES15_I6SArrayIS8_ILi3EES14_Li1ELi3EELi1ELi1EES8_I4KronIS8_I8DiagonalIS14_4OnesIS14_Li1ES8_IS9_IS10_EEEES18_IS14_S19_IS14_Li1ES8_IS9_IS10_EEEES15_IS14_Li2ELi1EEEES17_IS8_IS18_IS14_S19_IS14_Li1ES8_IS9_IS10_EEEES15_IS14_Li2ELi1EES18_IS14_S19_IS14_Li1ES8_IS9_IS10_EEEEEES17_IS8_IS15_IS14_Li2ELi1EES18_IS14_S19_IS14_Li1ES8_IS9_IS10_EEEES18_IS14_S19_IS14_Li1ES8_IS9_IS10_EEEEEEES18_IS14_S15_IS14_Li1ELi1EEES18_IS14_S15_IS14_Li1ELi1EEES17_IS8_IS15_IS14_Li2ELi1EES15_IS14_Li2ELi1EES15_IS14_Li2ELi1EEEES8_IS8_IS15_IS10_Li3ELi1EEES8_IS15_IS10_Li2ELi1EES15_IS10_Li2ELi1EES15_IS10_Li2ELi1EES15_IS10_Li2ELi1EES15_IS10_Li2ELi1EES15_IS10_Li2ELi1EEES8_IS15_IS10_Li1ELi1EES15_IS10_Li1ELi1EES15_IS10_Li1ELi1EES15_IS10_Li1ELi1EES15_IS10_Li1ELi1EES15_IS10_Li1ELi1EES15_IS10_Li1ELi1EES15_IS10_Li1ELi1EES15_IS10_Li1ELi1EES15_IS10_Li1ELi1EES15_IS10_Li1ELi1EES15_IS10_Li1ELi1EEES8_IS10_S10_S10_S10_S10_S10_S10_S10_EEES15_IS10_Li2ELi1EES15_IS10_Li2ELi1EE22GeneralSparseMatrixCSCI11PermutationILi4EES10_S15_IS21_ILi4EELi1ELi1EES15_IS10_Li1ELi1EEE
                    0.02%  9.9425ms        16  621.41us  1.9200us  1.9931ms  [CUDA memcpy DtoH]
                    0.02%  9.9299ms        72  137.92us  136.64us  140.00us  _Z28julia_broadcast_kernel_3071415CuKernelContext11StructArrayI6SArrayI5TupleILi5EE7Float64Li1ELi5EELi2ES2_I13CuDeviceArrayIS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EEE5Int64E11BroadcastedIvS2_I5OneToIS5_ES7_IS5_EE2__S2_IS3_8ExtrudedIS0_IS1_IS2_ILi5EES3_Li1ELi5EELi2ES2_I8SubArrayIS3_Li2ES4_IS3_Li3ELi1EES2_I5SliceIS7_IS5_EES5_S11_IS7_IS5_EEELifalseEES10_IS3_Li2ES4_IS3_Li3ELi1EES2_IS11_IS7_IS5_EES5_S11_IS7_IS5_EEELifalseEES10_IS3_Li2ES4_IS3_Li3ELi1EES2_IS11_IS7_IS5_EES5_S11_IS7_IS5_EEELifalseEES10_IS3_Li2ES4_IS3_Li3ELi1EES2_IS11_IS7_IS5_EES5_S11_IS7_IS5_EEELifalseEES10_IS3_Li2ES4_IS3_Li3ELi1EES2_IS11_IS7_IS5_EES5_S11_IS7_IS5_EEELifalseEEE14CartesianIndexILi2EEES2_I4BoolS13_ES2_IS5_S5_EEEES5_
                    0.01%  7.0730ms        36  196.47us  193.85us  199.74us  _Z28julia_broadcast_kernel_3086815CuKernelContext11StructArrayI6SArrayI5TupleILi5EE7Float64Li1ELi5EELi2ES2_I13CuDeviceArrayIS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EEE5Int64E11BroadcastedIvS2_I5OneToIS5_ES7_IS5_EE2__S2_I8ExtrudedIS0_IS1_IS2_ILi5EES3_Li1ELi5EELi2ES2_IS4_IS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EEES5_ES2_I4BoolS10_ES2_IS5_S5_EES9_IS0_IS1_IS2_ILi5EES3_Li1ELi5EELi2ES2_IS4_IS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EEES5_ES2_IS10_S10_ES2_IS5_S5_EEEES5_
                    0.01%  5.9889ms        36  166.36us  164.83us  170.69us  _Z28julia_broadcast_kernel_2657015CuKernelContext8SubArrayI7Float64Li3E13CuDeviceArrayIS1_Li4ELi1EE5TupleI5SliceI5OneToI5Int64EES6_S4_IS5_IS6_EES4_IS5_IS6_EEELifalseEE11BroadcastedI12CuArrayStyleILi3EES3_IS5_IS6_ES5_IS6_ES5_IS6_EE2__S3_I8ExtrudedIS2_IS1_Li3ELi1EES3_I4BoolS11_S11_ES3_IS6_S6_S6_EES6_EES6_
                    0.01%  5.1706ms        53  97.558us  2.6240us  852.54us  [CUDA memcpy HtoD]
                    0.01%  5.0196ms        36  139.43us  137.57us  140.19us  julia_getindex_kernel_26442(CuKernelContext, CuDeviceArray<Float64, int=3, int=1>, CuDeviceArray<Float64, int=4, int=1>, Tuple<Int64, CuDeviceArray<Float64, int=4, int=1>, CuDeviceArray<Float64, int=4, int=1>, CuDeviceArray<Float64, int=4, int=1>>, Slice<OneTo<CuDeviceArray<Float64, int=4, int=1>>>, CuDeviceArray<Float64, int=4, int=1>, Tuple<Int64<CuDeviceArray<Float64, int=4, int=1>>>, Tuple<Int64<CuDeviceArray<Float64, int=4, int=1>>>)
                    0.00%  1.4372ms         2  718.60us  716.28us  720.92us  julia_broadcast_kernel_23574(CuKernelContext, StructArray<SArray<Tuple<int=11>, Float64, int=1, int=11>, int=2, Tuple<CuDeviceArray<Tuple<int=11>, int=2, int=1>, Float64<Tuple<int=11>, int=2, int=1>, Float64<Tuple<int=11>, int=2, int=1>, Float64<Tuple<int=11>, int=2, int=1>, Float64<Tuple<int=11>, int=2, int=1>, Float64<Tuple<int=11>, int=2, int=1>, Float64<Tuple<int=11>, int=2, int=1>, Float64<Tuple<int=11>, int=2, int=1>, Float64<Tuple<int=11>, int=2, int=1>, Float64<Tuple<int=11>, int=2, int=1>, Float64<Tuple<int=11>, int=2, int=1>>, Int64>, Broadcasted<void, Tuple<OneTo<SArray<Tuple<int=11>, Float64, int=1, int=11>>, CuDeviceArray<Tuple<int=11, SArray<Tuple<int=11>, Float64, int=1, int=11>>, int=2, int=1>>, __, Tuple<Extruded<StructArray<SArray<Tuple<int=11>, Tuple<int=11>, int=1, int=11>, int=2, Tuple<Float64<Tuple<int=11>, int=2, int=1>, Float64<Tuple<int=11>, int=2, int=1>, Float64<Tuple<int=11>, int=2, int=1>, Float64<Tuple<int=11>, int=2, int=1>, Float64<Tuple<int=11>, int=2, int=1>, Float64<Tuple<int=11>, int=2, int=1>, Float64<Tuple<int=11>, int=2, int=1>, Float64<Tuple<int=11>, int=2, int=1>, Float64<Tuple<int=11>, int=2, int=1>, Float64<Tuple<int=11>, int=2, int=1>, Float64<Tuple<int=11>, int=2, int=1>>, SArray<Tuple<int=11>, Float64, int=1, int=11>>, Tuple<Bool, Float64<Tuple<int=11>, int=2, int=1>>, Tuple<SArray<Tuple<int=11>, Float64, int=1, int=11>, SArray<Tuple<int=11>, Float64, int=1, int=11>>>, Tuple<int=11>>>, SArray<Tuple<int=11>, Float64, int=1, int=11>)
                    0.00%  1.2563ms         2  628.17us  627.67us  628.67us  _Z28julia_broadcast_kernel_2298715CuKernelContext11StructArrayI6SArrayI5TupleILi6EE7Float64Li1ELi6EELi2ES2_I13CuDeviceArrayIS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EEE5Int64E11BroadcastedIvS2_I5OneToIS5_ES7_IS5_EE15_mean_variablesS2_I10CuRefValueI19EulerTotalEnergyLawIS3_Li3ELi5E49_____1_4__grav___9_81__pde_level_balance___false_EE8ExtrudedIS0_IS1_IS2_ILi5EES3_Li1ELi5EELi2ES2_I8SubArrayIS3_Li2ES4_IS3_Li3ELi1EES2_I5SliceIS7_IS5_EES5_S13_IS7_IS5_EEELifalseEES12_IS3_Li2ES4_IS3_Li3ELi1EES2_IS13_IS7_IS5_EES5_S13_IS7_IS5_EEELifalseEES12_IS3_Li2ES4_IS3_Li3ELi1EES2_IS13_IS7_IS5_EES5_S13_IS7_IS5_EEELifalseEES12_IS3_Li2ES4_IS3_Li3ELi1EES2_IS13_IS7_IS5_EES5_S13_IS7_IS5_EEELifalseEES12_IS3_Li2ES4_IS3_Li3ELi1EES2_IS13_IS7_IS5_EES5_S13_IS7_IS5_EEELifalseEEE14CartesianIndexILi2EEES2_I4BoolS15_ES2_IS5_S5_EES11_IS0_IS1_IS2_ILi9EES3_Li1ELi9EELi2ES2_IS4_IS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EEES5_ES2_IS15_S15_ES2_IS5_S5_EEEES5_
                    0.00%  1.0645ms         2  532.27us  529.08us  535.45us  _Z28julia_broadcast_kernel_2165215CuKernelContext11StructArrayI6SArrayI5TupleILi9EE7Float64Li1ELi9EELi2ES2_I13CuDeviceArrayIS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EEE5Int64E11BroadcastedIvS2_I5OneToIS5_ES7_IS5_EE10_auxiliaryS2_I10CuRefValueI19EulerTotalEnergyLawIS3_Li3ELi5E49_____1_4__grav___9_81__pde_level_balance___false_EE8ExtrudedIS0_IS1_IS2_ILi3EES3_Li1ELi3EELi2ES2_IS4_IS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EEES5_ES2_I4BoolS12_ES2_IS5_S5_EEEES5_
                    0.00%  927.67us         9  103.07us  102.14us  105.18us  _Z18julia_gpu______4667ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE17_gpu______466_29616CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi4E5TupleI5OneToI5Int64ES9_IS10_ES9_IS10_ES9_IS10_EEE7NDRangeILi4ES5_S5_S7_ILi4ES8_IS9_IS10_ES9_IS10_ES9_IS10_ES9_IS10_EEES7_ILi4ES8_IS9_IS10_ES9_IS10_ES9_IS10_ES9_IS10_EEEEE13CuDeviceArrayI7Float64Li4ELi1EES12_IS13_Li2ELi1EES12_IS13_Li4ELi1EES9_IS10_ES9_IS10_ES9_IS10_ES9_IS10_ES9_IS10_Ev4Bool
                    0.00%  912.82us         9  101.42us  100.35us  103.10us  _Z18julia_gpu______4637ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE17_gpu______463_29216CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi4E5TupleI5OneToI5Int64ES9_IS10_ES9_IS10_ES9_IS10_EEE7NDRangeILi4ES5_S5_S7_ILi4ES8_IS9_IS10_ES9_IS10_ES9_IS10_ES9_IS10_EEES7_ILi4ES8_IS9_IS10_ES9_IS10_ES9_IS10_ES9_IS10_EEEEE13CuDeviceArrayI7Float64Li4ELi1EES12_IS13_Li2ELi1EES12_IS13_Li4ELi1EES9_IS10_ES9_IS10_ES9_IS10_ES9_IS10_ES9_IS10_Ev4Bool
                    0.00%  830.55us         9  92.283us  91.135us  94.015us  _Z18julia_gpu______4607ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE17_gpu______460_28816CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi4E5TupleI5OneToI5Int64ES9_IS10_ES9_IS10_ES9_IS10_EEE7NDRangeILi4ES5_S5_S7_ILi4ES8_IS9_IS10_ES9_IS10_ES9_IS10_ES9_IS10_EEES7_ILi4ES8_IS9_IS10_ES9_IS10_ES9_IS10_ES9_IS10_EEEEE13CuDeviceArrayI7Float64Li4ELi1EES12_IS13_Li2ELi1EES12_IS13_Li4ELi1EES9_IS10_ES9_IS10_ES9_IS10_ES9_IS10_ES9_IS10_Ev4Bool
                    0.00%  666.68us         1  666.68us  666.68us  666.68us  julia_broadcast_kernel_23240(CuKernelContext, StructArray<SArray<Tuple<int=11>, Float64, int=1, int=11>, int=2, Tuple<CuDeviceArray<Tuple<int=11>, int=2, int=1>, Float64<Tuple<int=11>, int=2, int=1>, Float64<Tuple<int=11>, int=2, int=1>, Float64<Tuple<int=11>, int=2, int=1>, Float64<Tuple<int=11>, int=2, int=1>, Float64<Tuple<int=11>, int=2, int=1>, Float64<Tuple<int=11>, int=2, int=1>, Float64<Tuple<int=11>, int=2, int=1>, Float64<Tuple<int=11>, int=2, int=1>, Float64<Tuple<int=11>, int=2, int=1>, Float64<Tuple<int=11>, int=2, int=1>>, Int64>, Broadcasted<void, Tuple<OneTo<SArray<Tuple<int=11>, Float64, int=1, int=11>>, CuDeviceArray<Tuple<int=11, SArray<Tuple<int=11>, Float64, int=1, int=11>>, int=2, int=1>>, _second_moment_variables, Tuple<Extruded<StructArray<SArray<Tuple<int=6>, Tuple<int=11>, int=1, int=6>, int=2, Tuple<Float64<Tuple<int=11>, int=2, int=1>, Float64<Tuple<int=11>, int=2, int=1>, Float64<Tuple<int=11>, int=2, int=1>, Float64<Tuple<int=11>, int=2, int=1>, Float64<Tuple<int=11>, int=2, int=1>, Float64<Tuple<int=11>, int=2, int=1>>, SArray<Tuple<int=11>, Float64, int=1, int=11>>, Tuple<Bool, StructArray<SArray<Tuple<int=6>, Tuple<int=11>, int=1, int=6>, int=2, Tuple<Float64<Tuple<int=11>, int=2, int=1>, Float64<Tuple<int=11>, int=2, int=1>, Float64<Tuple<int=11>, int=2, int=1>, Float64<Tuple<int=11>, int=2, int=1>, Float64<Tuple<int=11>, int=2, int=1>, Float64<Tuple<int=11>, int=2, int=1>>, SArray<Tuple<int=11>, Float64, int=1, int=11>>>, Tuple<SArray<Tuple<int=11>, Float64, int=1, int=11>, SArray<Tuple<int=11>, Float64, int=1, int=11>>>>>, SArray<Tuple<int=11>, Float64, int=1, int=11>)
                    0.00%  626.97us         9  69.663us  68.639us  70.816us  julia_broadcast_kernel_10986(CuKernelContext, CuDeviceArray<Float64, int=2, int=1>, Broadcasted<CuArrayStyle<int=2>, Tuple<OneTo<Int64>, CuArrayStyle<int=2, Tuple>>, __, CuArrayStyle<CuDeviceArray<Float64, int=2, int=1, Broadcasted<int=2>, void, OneTo, CuArrayStyle<Extruded<CuDeviceArray<Float64, int=2, int=1>, CuArrayStyle<Bool, OneTo<Int64>>, CuArrayStyle<Tuple, Tuple>>, Int64<CuDeviceArray<Float64, int=2, int=1>, CuArrayStyle<OneTo<Int64>, OneTo<Int64>>, CuArrayStyle<Tuple, Tuple>>>>, CuDeviceArray<Float64, int=2, int=1, Broadcasted<int=2>, void, OneTo, CuArrayStyle<Int64<CuDeviceArray<Float64, int=2, int=1>, CuArrayStyle<OneTo<Int64>, OneTo<Int64>>, CuArrayStyle<Tuple, Tuple>>, Int64<CuDeviceArray<Float64, int=2, int=1>, CuArrayStyle<OneTo<Int64>, OneTo<Int64>>, CuArrayStyle<Tuple, Tuple>>>>>>, Tuple)
                    0.00%  525.21us         9  58.356us  57.407us  59.487us  _Z28julia_broadcast_kernel_1114715CuKernelContext8SubArrayI7Float64Li2E13CuDeviceArrayIS1_Li3ELi1EE5TupleI5SliceI5OneToI5Int64EES6_S4_IS5_IS6_EEELifalseEE11BroadcastedI12CuArrayStyleILi2EES3_IS5_IS6_ES5_IS6_EE2__S3_IS7_IS8_ILi2EEvS9_S3_I8ExtrudedIS2_IS1_Li2ELi1EES3_I4BoolS11_ES3_IS6_S6_EES10_IS2_IS1_Li2ELi1EES3_IS11_S11_ES3_IS6_S6_EEEES7_IS8_ILi2EEvS9_S3_IS6_S10_IS0_IS1_Li2ES2_IS1_Li3ELi1EES3_IS4_IS5_IS6_EES6_S4_IS5_IS6_EEELifalseEES3_IS11_S11_ES3_IS6_S6_EEEEEES6_
                    0.00%  504.57us         1  504.57us  504.57us  504.57us  _Z28julia_broadcast_kernel_3120115CuKernelContext11StructArrayI6SArrayI5TupleILi5EE7Float64Li1ELi5EELi2ES2_I8SubArrayIS3_Li2E13CuDeviceArrayIS3_Li3ELi1EES2_I5SliceI5OneToI5Int64EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEEE14CartesianIndexILi2EEE11BroadcastedIvS2_IS7_IS8_ES7_IS8_EE2__S2_I8ExtrudedIS0_IS1_IS2_ILi5EES3_Li1ELi5EELi2ES2_IS4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEEES9_ILi2EEES2_I4BoolS13_ES2_IS8_S8_EES3_EES8_
                    0.00%  497.34us         1  497.34us  497.34us  497.34us  _Z28julia_broadcast_kernel_1437615CuKernelContext11StructArrayI6SArrayI5TupleILi3EE7Float64Li1ELi3EELi2ES2_I8SubArrayIS3_Li2E13CuDeviceArrayIS3_Li3ELi1EES2_I5SliceI5OneToI5Int64EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEEE14CartesianIndexILi2EEE11BroadcastedIvS2_IS7_IS8_ES7_IS8_EE2__S2_I8ExtrudedIS0_IS1_IS2_ILi3EES3_Li1ELi3EELi2ES2_IS4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEES4_IS3_Li2ES5_IS3_Li3ELi1EES2_IS6_IS7_IS8_EES8_S6_IS7_IS8_EEELifalseEEES9_ILi2EEES2_I4BoolS13_ES2_IS8_S8_EES12_IS5_IS3_Li2ELi1EES2_IS13_S13_ES2_IS8_S8_EEEES8_
                    0.00%  394.88us         1  394.88us  394.88us  394.88us  _Z27julia_broadcast_kernel_986215CuKernelContext11StructArrayI6SArrayI5TupleILi3EE7Float64Li1ELi3EELi2ES2_I13CuDeviceArrayIS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EEE5Int64E11BroadcastedIvS2_I5OneToIS5_ES7_IS5_EE15_cubespherewarpS2_I8ExtrudedIS0_IS1_IS2_ILi3EES3_Li1ELi3EELi2ES2_I13ReshapedArrayIS3_Li2E8SubArrayIS3_Li4ES4_IS3_Li5ELi1EES2_I5SliceIS7_IS5_EES12_IS7_IS5_EES12_IS7_IS5_EES5_S12_IS7_IS5_EEELifalseEES2_I27SignedMultiplicativeInverseIS5_ES13_IS5_ES13_IS5_EEES10_IS3_Li2ES11_IS3_Li4ES4_IS3_Li5ELi1EES2_IS12_IS7_IS5_EES12_IS7_IS5_EES12_IS7_IS5_EES5_S12_IS7_IS5_EEELifalseEES2_IS13_IS5_ES13_IS5_ES13_IS5_EEES10_IS3_Li2ES11_IS3_Li4ES4_IS3_Li5ELi1EES2_IS12_IS7_IS5_EES12_IS7_IS5_EES12_IS7_IS5_EES5_S12_IS7_IS5_EEELifalseEES2_IS13_IS5_ES13_IS5_ES13_IS5_EEEE14CartesianIndexILi2EEES2_I4BoolS15_ES2_IS5_S5_EEEES5_
                    0.00%  322.97us         2  161.49us  160.25us  162.72us  _Z28julia_broadcast_kernel_2340715CuKernelContext11StructArrayI6SArrayI5TupleILi6EE7Float64Li1ELi6EELi2ES2_I13CuDeviceArrayIS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EEE5Int64E11BroadcastedIvS2_I5OneToIS5_ES7_IS5_EE2__S2_I8ExtrudedIS0_IS1_IS2_ILi6EES3_Li1ELi6EELi2ES2_IS4_IS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EES4_IS3_Li2ELi1EEES5_ES2_I4BoolS10_ES2_IS5_S5_EES3_EES5_
                    0.00%  218.97us         6  36.495us  34.527us  44.191us  _Z28julia_broadcast_kernel_1423815CuKernelContext8SubArrayI7Float64Li2E13CuDeviceArrayIS1_Li3ELi1EE5TupleI5SliceI5OneToI5Int64EES6_S4_IS5_IS6_EEELifalseEE11BroadcastedI12CuArrayStyleILi2EES3_IS5_IS6_ES5_IS6_EE9_identityS3_I8ExtrudedIS2_IS1_Li2ELi1EES3_I4BoolS11_ES3_IS6_S6_EEEES6_
                    0.00%  148.61us         4  37.151us  30.144us  45.152us  julia_broadcast_kernel_21455(CuKernelContext, CuDeviceArray<Float64, int=2, int=1>, Broadcasted<CuArrayStyle<int=2>, Tuple<OneTo<Int64>, CuArrayStyle<int=2, Tuple>>, __, CuArrayStyle<Extruded<CuDeviceArray<Float64, int=1, int=1>, CuArrayStyle<Bool>, CuArrayStyle<Tuple>>, Int64<CuDeviceArray<Float64, int=2, int=1>, CuArrayStyle<OneTo<Int64>, OneTo<Int64>>, CuArrayStyle<Tuple, Tuple>>>>, Tuple)
                    0.00%  148.16us         1  148.16us  148.16us  148.16us  _Z18julia_gpu______3257ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE16_gpu______325_9616CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi4E5TupleI5OneToI5Int64ES9_IS10_ES9_IS10_ES9_IS10_EEE7NDRangeILi4ES5_S5_S7_ILi4ES8_IS9_IS10_ES9_IS10_ES9_IS10_ES9_IS10_EEES7_ILi4ES8_IS9_IS10_ES9_IS10_ES9_IS10_ES9_IS10_EEEEE11StructArrayI6SArrayIS8_ILi3EE7Float64Li1ELi3EELi4ES8_I8SubArrayIS14_Li4E13CuDeviceArrayIS14_Li5ELi1EES8_I5SliceIS9_IS10_EES17_IS9_IS10_EES17_IS9_IS10_EES10_S17_IS9_IS10_EEELifalseEES15_IS14_Li4ES16_IS14_Li5ELi1EES8_IS17_IS9_IS10_EES17_IS9_IS10_EES17_IS9_IS10_EES10_S17_IS9_IS10_EEELifalseEES15_IS14_Li4ES16_IS14_Li5ELi1EES8_IS17_IS9_IS10_EES17_IS9_IS10_EES17_IS9_IS10_EES10_S17_IS9_IS10_EEELifalseEEE14CartesianIndexILi4EEES16_IS8_IS10_S10_S10_S10_S10_S10_S10_S10_ELi1ELi1EES16_IS14_Li1ELi1EES16_IS14_Li1ELi1EES16_IS14_Li1ELi1EES16_IS13_IS8_ILi3EES14_Li1ELi3EELi1ELi1EES9_IS10_ES9_IS10_ES9_IS10_ES9_IS10_ES9_IS10_ES9_IS10_ES9_IS10_ES9_IS10_Ev4Bool
                    0.00%  141.09us         1  141.09us  141.09us  141.09us  _Z28julia_broadcast_kernel_1081615CuKernelContext8SubArrayI7Float64Li2E13CuDeviceArrayIS1_Li3ELi1EE5TupleI5SliceI5OneToI5Int64EES6_S4_IS5_IS6_EEELifalseEE11BroadcastedIvS3_IS5_IS6_ES5_IS6_EE4_detS3_I8ExtrudedI11StructArrayI6SArrayIS3_ILi3ELi3EES1_Li2ELi9EELi2ES3_IS2_IS1_Li2ELi1EES2_IS1_Li2ELi1EES2_IS1_Li2ELi1EES2_IS1_Li2ELi1EES2_IS1_Li2ELi1EES2_IS1_Li2ELi1EES2_IS1_Li2ELi1EES2_IS1_Li2ELi1EES2_IS1_Li2ELi1EEES6_ES3_I4BoolS12_ES3_IS6_S6_EEEES6_
                    0.00%  135.13us         4  33.783us  25.375us  40.927us  _Z31julia_linear_copy_kernel__2135615CuKernelContext13CuDeviceArrayI7Float64Li2ELi1EE5Int648SubArrayIS1_Li2ES0_IS1_Li3ELi1EE5TupleI5SliceI5OneToIS2_EES2_S5_IS6_IS2_EEELifalseEES2_S2_
                    0.00%  121.98us         5  24.396us  23.456us  27.488us  _Z31julia_linear_copy_kernel__3135515CuKernelContext13CuDeviceArrayI7Float64Li1ELi1EE5Int648SubArrayIS1_Li2ES0_IS1_Li3ELi1EE5TupleI5SliceI5OneToIS2_EES2_S5_IS6_IS2_EEELifalseEES2_S2_
                    0.00%  103.94us         1  103.94us  103.94us  103.94us  _Z28julia_broadcast_kernel_2049315CuKernelContext13CuDeviceArrayI7Float64Li2ELi1EE11BroadcastedIv5TupleI5OneToI5Int64ES4_IS5_EE9_pressureS3_I10CuRefValueI19EulerTotalEnergyLawIS1_Li3ELi5E49_____1_4__grav___9_81__pde_level_balance___false_EE8ExtrudedI11StructArrayI6SArrayIS3_ILi5EES1_Li1ELi5EELi2ES3_I8SubArrayIS1_Li2ES0_IS1_Li3ELi1EES3_I5SliceIS4_IS5_EES5_S13_IS4_IS5_EEELifalseEES12_IS1_Li2ES0_IS1_Li3ELi1EES3_IS13_IS4_IS5_EES5_S13_IS4_IS5_EEELifalseEES12_IS1_Li2ES0_IS1_Li3ELi1EES3_IS13_IS4_IS5_EES5_S13_IS4_IS5_EEELifalseEES12_IS1_Li2ES0_IS1_Li3ELi1EES3_IS13_IS4_IS5_EES5_S13_IS4_IS5_EEELifalseEES12_IS1_Li2ES0_IS1_Li3ELi1EES3_IS13_IS4_IS5_EES5_S13_IS4_IS5_EEELifalseEEE14CartesianIndexILi2EEES3_I4BoolS15_ES3_IS5_S5_EES9_IS10_IS11_IS3_ILi9EES1_Li1ELi9EELi2ES3_IS0_IS1_Li2ELi1EES0_IS1_Li2ELi1EES0_IS1_Li2ELi1EES0_IS1_Li2ELi1EES0_IS1_Li2ELi1EES0_IS1_Li2ELi1EES0_IS1_Li2ELi1EES0_IS1_Li2ELi1EES0_IS1_Li2ELi1EEES5_ES3_IS15_S15_ES3_IS5_S5_EEEES5_
                    0.00%  96.606us         4  24.151us  21.440us  26.303us  _Z34julia_partial_mapreduce_grid_226829_identity8_add_sum7Float6416CartesianIndicesILi2E5TupleI5OneToI5Int64ES4_IS5_EEES2_ILi2ES3_IS4_IS5_ES4_IS5_EEE3ValILitrueEE13CuDeviceArrayIS1_Li3ELi1EE11BroadcastedI12CuArrayStyleILi2EES3_IS4_IS5_ES4_IS5_EES_S3_IS7_IS1_Li2ELi1EEEE
                    0.00%  88.479us         1  88.479us  88.479us  88.479us  _Z18julia_gpu______3737ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE17_gpu______373_16016CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi3E5TupleI5OneToI5Int64ES9_IS10_ES9_IS10_EEE7NDRangeILi3ES5_S5_S7_ILi3ES8_IS9_IS10_ES9_IS10_ES9_IS10_EEES7_ILi3ES8_IS9_IS10_ES9_IS10_ES9_IS10_EEEEE13ReshapedArrayI7Float64Li3E8SubArrayIS13_Li2E13CuDeviceArrayIS13_Li3ELi1EES8_I9UnitRangeIS10_ES10_5SliceIS9_IS10_EEELifalseEES8_I27SignedMultiplicativeInverseIS10_EEES12_IS13_Li4ES14_IS13_Li2ES15_IS13_Li3ELi1EES8_IS17_IS9_IS10_EES10_S17_IS9_IS10_EEELifalseEES8_IS18_IS10_EEES12_IS13_Li4ES14_IS13_Li2ES15_IS13_Li3ELi1EES8_IS17_IS9_IS10_EES10_S17_IS9_IS10_EEELifalseEES8_IS18_IS10_EEES9_IS10_ES9_IS10_ES9_IS10_Ev4Bool
                    0.00%  88.255us         1  88.255us  88.255us  88.255us  _Z18julia_gpu______3557ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE17_gpu______355_13616CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi3E5TupleI5OneToI5Int64ES9_IS10_ES9_IS10_EEE7NDRangeILi3ES5_S5_S7_ILi3ES8_IS9_IS10_ES9_IS10_ES9_IS10_EEES7_ILi3ES8_IS9_IS10_ES9_IS10_ES9_IS10_EEEEE13ReshapedArrayI7Float64Li3E8SubArrayIS13_Li2E13CuDeviceArrayIS13_Li3ELi1EES8_I9UnitRangeIS10_ES10_5SliceIS9_IS10_EEELifalseEES8_I27SignedMultiplicativeInverseIS10_EEES12_IS13_Li4ES14_IS13_Li2ES15_IS13_Li3ELi1EES8_IS17_IS9_IS10_EES10_S17_IS9_IS10_EEELifalseEES8_IS18_IS10_EEES12_IS13_Li4ES14_IS13_Li2ES15_IS13_Li3ELi1EES8_IS17_IS9_IS10_EES10_S17_IS9_IS10_EEELifalseEES8_IS18_IS10_EEES9_IS10_ES9_IS10_ES9_IS10_Ev4Bool
                    0.00%  86.975us         1  86.975us  86.975us  86.975us  _Z18julia_gpu______3407ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE17_gpu______340_11616CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi3E5TupleI5OneToI5Int64ES9_IS10_ES9_IS10_EEE7NDRangeILi3ES5_S5_S7_ILi3ES8_IS9_IS10_ES9_IS10_ES9_IS10_EEES7_ILi3ES8_IS9_IS10_ES9_IS10_ES9_IS10_EEEEE13ReshapedArrayI7Float64Li3E8SubArrayIS13_Li2E13CuDeviceArrayIS13_Li3ELi1EES8_I9UnitRangeIS10_ES10_5SliceIS9_IS10_EEELifalseEES8_I27SignedMultiplicativeInverseIS10_EEES12_IS13_Li4ES14_IS13_Li2ES15_IS13_Li3ELi1EES8_IS17_IS9_IS10_EES10_S17_IS9_IS10_EEELifalseEES8_IS18_IS10_EEES12_IS13_Li4ES14_IS13_Li2ES15_IS13_Li3ELi1EES8_IS17_IS9_IS10_EES10_S17_IS9_IS10_EEELifalseEES8_IS18_IS10_EEES9_IS10_ES9_IS10_ES9_IS10_Ev4Bool
                    0.00%  86.047us         1  86.047us  86.047us  86.047us  _Z18julia_gpu______3377ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE17_gpu______337_11216CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi3E5TupleI5OneToI5Int64ES9_IS10_ES9_IS10_EEE7NDRangeILi3ES5_S5_S7_ILi3ES8_IS9_IS10_ES9_IS10_ES9_IS10_EEES7_ILi3ES8_IS9_IS10_ES9_IS10_ES9_IS10_EEEEE13ReshapedArrayI7Float64Li3E8SubArrayIS13_Li2E13CuDeviceArrayIS13_Li3ELi1EES8_I9UnitRangeIS10_ES10_5SliceIS9_IS10_EEELifalseEES8_I27SignedMultiplicativeInverseIS10_EEES12_IS13_Li4ES14_IS13_Li2ES15_IS13_Li3ELi1EES8_IS17_IS9_IS10_EES10_S17_IS9_IS10_EEELifalseEES8_IS18_IS10_EEES12_IS13_Li4ES14_IS13_Li2ES15_IS13_Li3ELi1EES8_IS17_IS9_IS10_EES10_S17_IS9_IS10_EEELifalseEES8_IS18_IS10_EEES9_IS10_ES9_IS10_ES9_IS10_Ev4Bool
                    0.00%  84.384us         1  84.384us  84.384us  84.384us  _Z28julia_broadcast_kernel_1410415CuKernelContext13CuDeviceArrayI7Float64Li2ELi1EE11BroadcastedIv5TupleI5OneToI5Int64ES4_IS5_EE5_normS3_I8ExtrudedI11StructArrayI6SArrayIS3_ILi3EES1_Li1ELi3EELi2ES3_I8SubArrayIS1_Li2ES0_IS1_Li3ELi1EES3_I5SliceIS4_IS5_EES5_S11_IS4_IS5_EEELifalseEES10_IS1_Li2ES0_IS1_Li3ELi1EES3_IS11_IS4_IS5_EES5_S11_IS4_IS5_EEELifalseEES10_IS1_Li2ES0_IS1_Li3ELi1EES3_IS11_IS4_IS5_EES5_S11_IS4_IS5_EEELifalseEEE14CartesianIndexILi2EEES3_I4BoolS13_ES3_IS5_S5_EEEES5_
                    0.00%  83.071us         2  41.535us  40.959us  42.112us  _Z28julia_broadcast_kernel_2255215CuKernelContext13CuDeviceArrayI7Float64Li2ELi1EE11BroadcastedI12CuArrayStyleILi2EE5TupleI5OneToI5Int64ES5_IS6_EE2__S4_I8ExtrudedI8SubArrayIS1_Li2ES0_IS1_Li3ELi1EES4_I5SliceIS5_IS6_EES6_S10_IS5_IS6_EEELifalseEES4_I4BoolS11_ES4_IS6_S6_EES8_IS0_IS1_Li2ELi1EES4_IS11_S11_ES4_IS6_S6_EEEES6_
                    0.00%  82.975us         1  82.975us  82.975us  82.975us  _Z18julia_gpu______3587ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE17_gpu______358_14016CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi3E5TupleI5OneToI5Int64ES9_IS10_ES9_IS10_EEE7NDRangeILi3ES5_S5_S7_ILi3ES8_IS9_IS10_ES9_IS10_ES9_IS10_EEES7_ILi3ES8_IS9_IS10_ES9_IS10_ES9_IS10_EEEEE13ReshapedArrayI7Float64Li3E8SubArrayIS13_Li2E13CuDeviceArrayIS13_Li3ELi1EES8_I9UnitRangeIS10_ES10_5SliceIS9_IS10_EEELifalseEES8_I27SignedMultiplicativeInverseIS10_EEES12_IS13_Li4ES14_IS13_Li2ES15_IS13_Li3ELi1EES8_IS17_IS9_IS10_EES10_S17_IS9_IS10_EEELifalseEES8_IS18_IS10_EEES12_IS13_Li4ES14_IS13_Li2ES15_IS13_Li3ELi1EES8_IS17_IS9_IS10_EES10_S17_IS9_IS10_EEELifalseEES8_IS18_IS10_EEES9_IS10_ES9_IS10_ES9_IS10_Ev4Bool
                    0.00%  81.215us         1  81.215us  81.215us  81.215us  _Z18julia_gpu______3767ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE17_gpu______376_16416CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi3E5TupleI5OneToI5Int64ES9_IS10_ES9_IS10_EEE7NDRangeILi3ES5_S5_S7_ILi3ES8_IS9_IS10_ES9_IS10_ES9_IS10_EEES7_ILi3ES8_IS9_IS10_ES9_IS10_ES9_IS10_EEEEE13ReshapedArrayI7Float64Li3E8SubArrayIS13_Li2E13CuDeviceArrayIS13_Li3ELi1EES8_I9UnitRangeIS10_ES10_5SliceIS9_IS10_EEELifalseEES8_I27SignedMultiplicativeInverseIS10_EEES12_IS13_Li4ES14_IS13_Li2ES15_IS13_Li3ELi1EES8_IS17_IS9_IS10_EES10_S17_IS9_IS10_EEELifalseEES8_IS18_IS10_EEES12_IS13_Li4ES14_IS13_Li2ES15_IS13_Li3ELi1EES8_IS17_IS9_IS10_EES10_S17_IS9_IS10_EEELifalseEES8_IS18_IS10_EEES9_IS10_ES9_IS10_ES9_IS10_Ev4Bool
                    0.00%  80.416us         1  80.416us  80.416us  80.416us  _Z18julia_gpu______3647ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE17_gpu______364_14816CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi3E5TupleI5OneToI5Int64ES9_IS10_ES9_IS10_EEE7NDRangeILi3ES5_S5_S7_ILi3ES8_IS9_IS10_ES9_IS10_ES9_IS10_EEES7_ILi3ES8_IS9_IS10_ES9_IS10_ES9_IS10_EEEEE13ReshapedArrayI7Float64Li3E8SubArrayIS13_Li2E13CuDeviceArrayIS13_Li3ELi1EES8_I9UnitRangeIS10_ES10_5SliceIS9_IS10_EEELifalseEES8_I27SignedMultiplicativeInverseIS10_EEES12_IS13_Li4ES14_IS13_Li2ES15_IS13_Li3ELi1EES8_IS17_IS9_IS10_EES10_S17_IS9_IS10_EEELifalseEES8_IS18_IS10_EEES12_IS13_Li4ES14_IS13_Li2ES15_IS13_Li3ELi1EES8_IS17_IS9_IS10_EES10_S17_IS9_IS10_EEELifalseEES8_IS18_IS10_EEES9_IS10_ES9_IS10_ES9_IS10_Ev4Bool
                    0.00%  78.496us         3  26.165us  24.768us  27.168us  _Z34julia_partial_mapreduce_grid_220009_identity4_min7Float6416CartesianIndicesILi2E5TupleI5OneToI5Int64ES4_IS5_EEES2_ILi2ES3_IS4_IS5_ES4_IS5_EEE3ValILitrueEE13CuDeviceArrayIS1_Li3ELi1EE11BroadcastedI12CuArrayStyleILi2EES3_IS4_IS5_ES4_IS5_EES_S3_IS7_IS1_Li2ELi1EEEE
                    0.00%  75.999us         1  75.999us  75.999us  75.999us  _Z18julia_gpu______3467ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE17_gpu______346_12416CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi3E5TupleI5OneToI5Int64ES9_IS10_ES9_IS10_EEE7NDRangeILi3ES5_S5_S7_ILi3ES8_IS9_IS10_ES9_IS10_ES9_IS10_EEES7_ILi3ES8_IS9_IS10_ES9_IS10_ES9_IS10_EEEEE13ReshapedArrayI7Float64Li3E8SubArrayIS13_Li2E13CuDeviceArrayIS13_Li3ELi1EES8_I9UnitRangeIS10_ES10_5SliceIS9_IS10_EEELifalseEES8_I27SignedMultiplicativeInverseIS10_EEES12_IS13_Li4ES14_IS13_Li2ES15_IS13_Li3ELi1EES8_IS17_IS9_IS10_EES10_S17_IS9_IS10_EEELifalseEES8_IS18_IS10_EEES12_IS13_Li4ES14_IS13_Li2ES15_IS13_Li3ELi1EES8_IS17_IS9_IS10_EES10_S17_IS9_IS10_EEELifalseEES8_IS18_IS10_EEES9_IS10_ES9_IS10_ES9_IS10_Ev4Bool
                    0.00%  74.719us         1  74.719us  74.719us  74.719us  _Z18julia_gpu______3827ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE17_gpu______382_17216CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi3E5TupleI5OneToI5Int64ES9_IS10_ES9_IS10_EEE7NDRangeILi3ES5_S5_S7_ILi3ES8_IS9_IS10_ES9_IS10_ES9_IS10_EEES7_ILi3ES8_IS9_IS10_ES9_IS10_ES9_IS10_EEEEE13ReshapedArrayI7Float64Li3E8SubArrayIS13_Li2E13CuDeviceArrayIS13_Li3ELi1EES8_I9UnitRangeIS10_ES10_5SliceIS9_IS10_EEELifalseEES8_I27SignedMultiplicativeInverseIS10_EEES12_IS13_Li4ES14_IS13_Li2ES15_IS13_Li3ELi1EES8_IS17_IS9_IS10_EES10_S17_IS9_IS10_EEELifalseEES8_IS18_IS10_EEES12_IS13_Li4ES14_IS13_Li2ES15_IS13_Li3ELi1EES8_IS17_IS9_IS10_EES10_S17_IS9_IS10_EEELifalseEES8_IS18_IS10_EEES9_IS10_ES9_IS10_ES9_IS10_Ev4Bool
                    0.00%  74.239us         1  74.239us  74.239us  74.239us  _Z18julia_gpu______3617ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE17_gpu______361_14416CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi3E5TupleI5OneToI5Int64ES9_IS10_ES9_IS10_EEE7NDRangeILi3ES5_S5_S7_ILi3ES8_IS9_IS10_ES9_IS10_ES9_IS10_EEES7_ILi3ES8_IS9_IS10_ES9_IS10_ES9_IS10_EEEEE13ReshapedArrayI7Float64Li3E8SubArrayIS13_Li2E13CuDeviceArrayIS13_Li3ELi1EES8_I9UnitRangeIS10_ES10_5SliceIS9_IS10_EEELifalseEES8_I27SignedMultiplicativeInverseIS10_EEES12_IS13_Li4ES14_IS13_Li2ES15_IS13_Li3ELi1EES8_IS17_IS9_IS10_EES10_S17_IS9_IS10_EEELifalseEES8_IS18_IS10_EEES12_IS13_Li4ES14_IS13_Li2ES15_IS13_Li3ELi1EES8_IS17_IS9_IS10_EES10_S17_IS9_IS10_EEELifalseEES8_IS18_IS10_EEES9_IS10_ES9_IS10_ES9_IS10_Ev4Bool
                    0.00%  72.799us         1  72.799us  72.799us  72.799us  _Z18julia_gpu______3437ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE17_gpu______343_12016CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi3E5TupleI5OneToI5Int64ES9_IS10_ES9_IS10_EEE7NDRangeILi3ES5_S5_S7_ILi3ES8_IS9_IS10_ES9_IS10_ES9_IS10_EEES7_ILi3ES8_IS9_IS10_ES9_IS10_ES9_IS10_EEEEE13ReshapedArrayI7Float64Li3E8SubArrayIS13_Li2E13CuDeviceArrayIS13_Li3ELi1EES8_I9UnitRangeIS10_ES10_5SliceIS9_IS10_EEELifalseEES8_I27SignedMultiplicativeInverseIS10_EEES12_IS13_Li4ES14_IS13_Li2ES15_IS13_Li3ELi1EES8_IS17_IS9_IS10_EES10_S17_IS9_IS10_EEELifalseEES8_IS18_IS10_EEES12_IS13_Li4ES14_IS13_Li2ES15_IS13_Li3ELi1EES8_IS17_IS9_IS10_EES10_S17_IS9_IS10_EEELifalseEES8_IS18_IS10_EEES9_IS10_ES9_IS10_ES9_IS10_Ev4Bool
                    0.00%  71.807us         1  71.807us  71.807us  71.807us  _Z18julia_gpu______3887ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE17_gpu______388_18016CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi3E5TupleI5OneToI5Int64ES9_IS10_ES9_IS10_EEE7NDRangeILi3ES5_S5_S7_ILi3ES8_IS9_IS10_ES9_IS10_ES9_IS10_EEES7_ILi3ES8_IS9_IS10_ES9_IS10_ES9_IS10_EEEEE13ReshapedArrayI7Float64Li3E8SubArrayIS13_Li2E13CuDeviceArrayIS13_Li3ELi1EES8_I9UnitRangeIS10_ES10_5SliceIS9_IS10_EEELifalseEES8_I27SignedMultiplicativeInverseIS10_EEES12_IS13_Li4ES14_IS13_Li2ES15_IS13_Li3ELi1EES8_IS17_IS9_IS10_EES10_S17_IS9_IS10_EEELifalseEES8_IS18_IS10_EEES12_IS13_Li4ES14_IS13_Li2ES15_IS13_Li3ELi1EES8_IS17_IS9_IS10_EES10_S17_IS9_IS10_EEELifalseEES8_IS18_IS10_EEES9_IS10_ES9_IS10_ES9_IS10_Ev4Bool
                    0.00%  71.680us         1  71.680us  71.680us  71.680us  _Z18julia_gpu______3527ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE17_gpu______352_13216CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi3E5TupleI5OneToI5Int64ES9_IS10_ES9_IS10_EEE7NDRangeILi3ES5_S5_S7_ILi3ES8_IS9_IS10_ES9_IS10_ES9_IS10_EEES7_ILi3ES8_IS9_IS10_ES9_IS10_ES9_IS10_EEEEE13ReshapedArrayI7Float64Li3E8SubArrayIS13_Li2E13CuDeviceArrayIS13_Li3ELi1EES8_I9UnitRangeIS10_ES10_5SliceIS9_IS10_EEELifalseEES8_I27SignedMultiplicativeInverseIS10_EEES12_IS13_Li4ES14_IS13_Li2ES15_IS13_Li3ELi1EES8_IS17_IS9_IS10_EES10_S17_IS9_IS10_EEELifalseEES8_IS18_IS10_EEES12_IS13_Li4ES14_IS13_Li2ES15_IS13_Li3ELi1EES8_IS17_IS9_IS10_EES10_S17_IS9_IS10_EEELifalseEES8_IS18_IS10_EEES9_IS10_ES9_IS10_ES9_IS10_Ev4Bool
                    0.00%  71.328us         1  71.328us  71.328us  71.328us  _Z18julia_gpu______3797ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE17_gpu______379_16816CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi3E5TupleI5OneToI5Int64ES9_IS10_ES9_IS10_EEE7NDRangeILi3ES5_S5_S7_ILi3ES8_IS9_IS10_ES9_IS10_ES9_IS10_EEES7_ILi3ES8_IS9_IS10_ES9_IS10_ES9_IS10_EEEEE13ReshapedArrayI7Float64Li3E8SubArrayIS13_Li2E13CuDeviceArrayIS13_Li3ELi1EES8_I9UnitRangeIS10_ES10_5SliceIS9_IS10_EEELifalseEES8_I27SignedMultiplicativeInverseIS10_EEES12_IS13_Li4ES14_IS13_Li2ES15_IS13_Li3ELi1EES8_IS17_IS9_IS10_EES10_S17_IS9_IS10_EEELifalseEES8_IS18_IS10_EEES12_IS13_Li4ES14_IS13_Li2ES15_IS13_Li3ELi1EES8_IS17_IS9_IS10_EES10_S17_IS9_IS10_EEELifalseEES8_IS18_IS10_EEES9_IS10_ES9_IS10_ES9_IS10_Ev4Bool
                    0.00%  67.615us         1  67.615us  67.615us  67.615us  _Z39julia_gpu_min_neighbour_distance_kernel7ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE34_gpu_min_neighbour_distance_kernel16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi1E5TupleI5OneToI5Int64EEE7NDRangeILi1ES5_10StaticSizeI6_256__ES7_ILi1ES8_IS9_IS10_EEEvEE13CuDeviceArrayI7Float64Li2ELi1EE11StructArrayI6SArrayIS8_ILi3EES14_Li1ELi3EELi2ES8_IS13_IS14_Li2ELi1EES13_IS14_Li2ELi1EES13_IS14_Li2ELi1EEES10_E3ValI10_1__4__16_ES17_ILi64EES17_ILi1EE
                    0.00%  67.360us         1  67.360us  67.360us  67.360us  _Z39julia_gpu_min_neighbour_distance_kernel7ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE34_gpu_min_neighbour_distance_kernel16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi1E5TupleI5OneToI5Int64EEE7NDRangeILi1ES5_10StaticSizeI6_256__ES7_ILi1ES8_IS9_IS10_EEEvEE13CuDeviceArrayI7Float64Li2ELi1EE11StructArrayI6SArrayIS8_ILi3EES14_Li1ELi3EELi2ES8_IS13_IS14_Li2ELi1EES13_IS14_Li2ELi1EES13_IS14_Li2ELi1EEES10_E3ValI10_1__4__16_ES17_ILi64EES17_ILi3EE
                    0.00%  66.976us         1  66.976us  66.976us  66.976us  _Z18julia_gpu______3857ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE17_gpu______385_17616CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi3E5TupleI5OneToI5Int64ES9_IS10_ES9_IS10_EEE7NDRangeILi3ES5_S5_S7_ILi3ES8_IS9_IS10_ES9_IS10_ES9_IS10_EEES7_ILi3ES8_IS9_IS10_ES9_IS10_ES9_IS10_EEEEE13ReshapedArrayI7Float64Li3E8SubArrayIS13_Li2E13CuDeviceArrayIS13_Li3ELi1EES8_I9UnitRangeIS10_ES10_5SliceIS9_IS10_EEELifalseEES8_I27SignedMultiplicativeInverseIS10_EEES12_IS13_Li4ES14_IS13_Li2ES15_IS13_Li3ELi1EES8_IS17_IS9_IS10_EES10_S17_IS9_IS10_EEELifalseEES8_IS18_IS10_EEES12_IS13_Li4ES14_IS13_Li2ES15_IS13_Li3ELi1EES8_IS17_IS9_IS10_EES10_S17_IS9_IS10_EEELifalseEES8_IS18_IS10_EEES9_IS10_ES9_IS10_ES9_IS10_Ev4Bool
                    0.00%  66.975us         1  66.975us  66.975us  66.975us  _Z18julia_gpu______3707ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE17_gpu______370_15616CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi3E5TupleI5OneToI5Int64ES9_IS10_ES9_IS10_EEE7NDRangeILi3ES5_S5_S7_ILi3ES8_IS9_IS10_ES9_IS10_ES9_IS10_EEES7_ILi3ES8_IS9_IS10_ES9_IS10_ES9_IS10_EEEEE13ReshapedArrayI7Float64Li3E8SubArrayIS13_Li2E13CuDeviceArrayIS13_Li3ELi1EES8_I9UnitRangeIS10_ES10_5SliceIS9_IS10_EEELifalseEES8_I27SignedMultiplicativeInverseIS10_EEES12_IS13_Li4ES14_IS13_Li2ES15_IS13_Li3ELi1EES8_IS17_IS9_IS10_EES10_S17_IS9_IS10_EEELifalseEES8_IS18_IS10_EEES12_IS13_Li4ES14_IS13_Li2ES15_IS13_Li3ELi1EES8_IS17_IS9_IS10_EES10_S17_IS9_IS10_EEELifalseEES8_IS18_IS10_EEES9_IS10_ES9_IS10_ES9_IS10_Ev4Bool
                    0.00%  66.623us         1  66.623us  66.623us  66.623us  _Z18julia_gpu______3677ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE17_gpu______367_15216CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi3E5TupleI5OneToI5Int64ES9_IS10_ES9_IS10_EEE7NDRangeILi3ES5_S5_S7_ILi3ES8_IS9_IS10_ES9_IS10_ES9_IS10_EEES7_ILi3ES8_IS9_IS10_ES9_IS10_ES9_IS10_EEEEE13ReshapedArrayI7Float64Li3E8SubArrayIS13_Li2E13CuDeviceArrayIS13_Li3ELi1EES8_I9UnitRangeIS10_ES10_5SliceIS9_IS10_EEELifalseEES8_I27SignedMultiplicativeInverseIS10_EEES12_IS13_Li4ES14_IS13_Li2ES15_IS13_Li3ELi1EES8_IS17_IS9_IS10_EES10_S17_IS9_IS10_EEELifalseEES8_IS18_IS10_EEES12_IS13_Li4ES14_IS13_Li2ES15_IS13_Li3ELi1EES8_IS17_IS9_IS10_EES10_S17_IS9_IS10_EEELifalseEES8_IS18_IS10_EEES9_IS10_ES9_IS10_ES9_IS10_Ev4Bool
                    0.00%  66.176us         1  66.176us  66.176us  66.176us  _Z39julia_gpu_min_neighbour_distance_kernel7ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE34_gpu_min_neighbour_distance_kernel16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi1E5TupleI5OneToI5Int64EEE7NDRangeILi1ES5_10StaticSizeI6_256__ES7_ILi1ES8_IS9_IS10_EEEvEE13CuDeviceArrayI7Float64Li2ELi1EE11StructArrayI6SArrayIS8_ILi3EES14_Li1ELi3EELi2ES8_IS13_IS14_Li2ELi1EES13_IS14_Li2ELi1EES13_IS14_Li2ELi1EEES10_E3ValI10_1__4__16_ES17_ILi64EES17_ILi2EE
                    0.00%  65.855us         1  65.855us  65.855us  65.855us  _Z18julia_gpu______3497ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE17_gpu______349_12816CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi3E5TupleI5OneToI5Int64ES9_IS10_ES9_IS10_EEE7NDRangeILi3ES5_S5_S7_ILi3ES8_IS9_IS10_ES9_IS10_ES9_IS10_EEES7_ILi3ES8_IS9_IS10_ES9_IS10_ES9_IS10_EEEEE13ReshapedArrayI7Float64Li3E8SubArrayIS13_Li2E13CuDeviceArrayIS13_Li3ELi1EES8_I9UnitRangeIS10_ES10_5SliceIS9_IS10_EEELifalseEES8_I27SignedMultiplicativeInverseIS10_EEES12_IS13_Li4ES14_IS13_Li2ES15_IS13_Li3ELi1EES8_IS17_IS9_IS10_EES10_S17_IS9_IS10_EEELifalseEES8_IS18_IS10_EEES12_IS13_Li4ES14_IS13_Li2ES15_IS13_Li3ELi1EES8_IS17_IS9_IS10_EES10_S17_IS9_IS10_EEELifalseEES8_IS18_IS10_EEES9_IS10_ES9_IS10_ES9_IS10_Ev4Bool
                    0.00%  59.582us         2  29.791us  29.503us  30.079us  julia_broadcast_kernel_21550(CuKernelContext, CuDeviceArray<Float64, int=2, int=1>, Broadcasted<CuArrayStyle<int=2>, Tuple<OneTo<Int64>, CuArrayStyle<int=2, Tuple>>, __, CuArrayStyle<Tuple, Extruded<CuDeviceArray<Float64, int=2, int=1>, CuArrayStyle<Bool, OneTo<Int64>>, CuArrayStyle<Tuple, Tuple>>>>, Tuple)
                    0.00%  43.360us         3  14.453us  3.0720us  21.536us  julia__5_14623(CuKernelContext, CuDeviceArray<Int64, int=2, int=1>, Int64)
                    0.00%  40.991us         1  40.991us  40.991us  40.991us  _Z28julia_broadcast_kernel_2067815CuKernelContext13CuDeviceArrayI7Float64Li2ELi1EE11BroadcastedI12CuArrayStyleILi2EE5TupleI5OneToI5Int64ES5_IS6_EE11_soundspeedS4_I10CuRefValueI19EulerTotalEnergyLawIS1_Li3ELi5E49_____1_4__grav___9_81__pde_level_balance___false_EE8ExtrudedI8SubArrayIS1_Li2ES0_IS1_Li3ELi1EES4_I5SliceIS5_IS6_EES6_S12_IS5_IS6_EEELifalseEES4_I4BoolS13_ES4_IS6_S6_EES10_IS0_IS1_Li2ELi1EES4_IS13_S13_ES4_IS6_S6_EEEES6_
                    0.00%  24.959us         1  24.959us  24.959us  24.959us  _Z34julia_partial_mapreduce_grid_209469_identity4_max7Float6416CartesianIndicesILi2E5TupleI5OneToI5Int64ES4_IS5_EEES2_ILi2ES3_IS4_IS5_ES4_IS5_EEE3ValILitrueEE13CuDeviceArrayIS1_Li3ELi1EE11BroadcastedI12CuArrayStyleILi2EES3_IS4_IS5_ES4_IS5_EES_S3_IS7_IS1_Li2ELi1EEEE
                    0.00%  22.239us         3  7.4130us  5.0240us  8.6720us  _Z34julia_partial_mapreduce_grid_221299_identity4_min7Float6416CartesianIndicesILi3E5TupleI5OneToI5Int64ES4_IS5_ES4_IS5_EEES2_ILi3ES3_IS4_IS5_ES4_IS5_ES4_IS5_EEE3ValILitrueEE13CuDeviceArrayIS1_Li4ELi1EES7_IS1_Li3ELi1EE
                    0.00%  18.432us         4  4.6080us  3.1040us  7.0400us  _Z34julia_partial_mapreduce_grid_228119_identity8_add_sum7Float6416CartesianIndicesILi3E5TupleI5OneToI5Int64ES4_IS5_ES4_IS5_EEES2_ILi3ES3_IS4_IS5_ES4_IS5_ES4_IS5_EEE3ValILitrueEE13CuDeviceArrayIS1_Li4ELi1EES7_IS1_Li3ELi1EE
                    0.00%  13.984us         1  13.984us  13.984us  13.984us  _Z18julia_gpu______4357ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE17_gpu______435_250I5Int64E16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi4E5TupleI5OneToIS4_ES10_IS4_ES10_IS4_ES10_IS4_EEE7NDRangeILi4ES6_S6_S8_ILi4ES9_IS10_IS4_ES10_IS4_ES10_IS4_ES10_IS4_EEES8_ILi4ES9_IS10_IS4_ES10_IS4_ES10_IS4_ES10_IS4_EEEEE13CuDeviceArrayIS9_IS4_S4_S4_S4_S4_S4_S4_S4_ELi4ELi1EES12_IS9_IS4_S4_S4_S4_ELi3ELi1EES10_IS4_ES10_IS4_ES10_IS4_ES10_IS4_Ev4Bool
                    0.00%  11.680us         5  2.3360us  1.7920us  2.6880us  julia__9_4108(CuKernelContext, CuDeviceArray<Float64, int=1, int=1>, CuDeviceArray<Float64, int=1, int=1>, Int64, CuDeviceArray<Float64, int=1, int=1>)
                    0.00%  11.007us         4  2.7510us  2.4000us  3.0720us  julia_getindex_kernel_7442(CuKernelContext, CuDeviceArray<Int64, int=2, int=1>, CuDeviceArray<Int64, int=5, int=1>, Tuple<Int64, Int64, Int64, Int64, Int64>, Slice<OneTo<Int64>>, Int64, CuDeviceArray<Int64<Tuple<Int64>>, int=5, int=1>, Int64, Int64)
                    0.00%  10.784us         4  2.6960us  2.4000us  2.9440us  julia_getindex_kernel_7709(CuKernelContext, CuDeviceArray<Int64, int=2, int=1>, CuDeviceArray<Int64, int=5, int=1>, Tuple<Int64, Int64, Int64, Int64, Int64>, StepRange<Int64, Int64>, Int64, CuDeviceArray<Int64<Int64, Int64>, int=5, int=1>, Int64, Int64)
                    0.00%  9.0880us         4  2.2720us  2.1120us  2.3360us  _Z27julia_broadcast_kernel_756815CuKernelContext8SubArrayI5Int64Li2E13CuDeviceArrayIS1_Li5ELi1EE5TupleIS1_5SliceI5OneToIS1_EES1_S4_IS5_IS1_EES1_ELifalseEE11BroadcastedI12CuArrayStyleILi2EES3_IS5_IS1_ES5_IS1_EE9_identityS3_I8ExtrudedIS2_IS1_Li2ELi1EES3_I4BoolS10_ES3_IS1_S1_EEEES1_
                    0.00%  8.9600us         1  8.9600us  8.9600us  8.9600us  _Z35julia_gpu_materializeboundaryfaces_7ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE30_gpu_materializeboundaryfaces_16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi1E5TupleI5OneToI5Int64EEE7NDRangeILi1ES5_10StaticSizeI6_256__ES7_ILi1ES8_IS9_IS10_EEEvEE11LobattoCellI7Float645ArrayS8_ILi4ELi4ELi4EELi3ES8_I13CuDeviceArrayIS14_Li3ELi1EES15_IS14_Li3ELi1EES15_IS14_Li3ELi1EEES15_I6SArrayIS8_ILi3EES14_Li1ELi3EELi1ELi1EES8_I4KronIS8_I8DiagonalIS14_4OnesIS14_Li1ES8_IS9_IS10_EEEES18_IS14_S19_IS14_Li1ES8_IS9_IS10_EEEES15_IS14_Li2ELi1EEEES17_IS8_IS18_IS14_S19_IS14_Li1ES8_IS9_IS10_EEEES15_IS14_Li2ELi1EES18_IS14_S19_IS14_Li1ES8_IS9_IS10_EEEEEES17_IS8_IS15_IS14_Li2ELi1EES18_IS14_S19_IS14_Li1ES8_IS9_IS10_EEEES18_IS14_S19_IS14_Li1ES8_IS9_IS10_EEEEEEES18_IS14_S15_IS14_Li1ELi1EEES18_IS14_S15_IS14_Li1ELi1EEES17_IS8_IS15_IS14_Li2ELi1EES15_IS14_Li2ELi1EES15_IS14_Li2ELi1EEEES8_IS8_IS15_IS10_Li3ELi1EEES8_IS15_IS10_Li2ELi1EES15_IS10_Li2ELi1EES15_IS10_Li2ELi1EES15_IS10_Li2ELi1EES15_IS10_Li2ELi1EES15_IS10_Li2ELi1EEES8_IS15_IS10_Li1ELi1EES15_IS10_Li1ELi1EES15_IS10_Li1ELi1EES15_IS10_Li1ELi1EES15_IS10_Li1ELi1EES15_IS10_Li1ELi1EES15_IS10_Li1ELi1EES15_IS10_Li1ELi1EES15_IS10_Li1ELi1EES15_IS10_Li1ELi1EES15_IS10_Li1ELi1EES15_IS10_Li1ELi1EEES8_IS10_S10_S10_S10_S10_S10_S10_S10_EEES15_IS10_Li2ELi1EE22GeneralSparseMatrixCSCI11PermutationILi4EES10_S15_IS21_ILi4EELi1ELi1EES15_IS10_Li1ELi1EEE
                    0.00%  8.8640us         1  8.8640us  8.8640us  8.8640us  _Z18julia_gpu______4327ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE17_gpu______432_24616CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi4E5TupleI5OneToI5Int64ES9_IS10_ES9_IS10_ES9_IS10_EEE7NDRangeILi4ES5_S5_S7_ILi4ES8_IS9_IS10_ES9_IS10_ES9_IS10_ES9_IS10_EEES7_ILi4ES8_IS9_IS10_ES9_IS10_ES9_IS10_ES9_IS10_EEEEE13CuDeviceArrayI6SArrayIS8_ILi3EE7Float64Li1ELi3EELi4ELi1EES12_IS13_IS8_ILi3EES14_Li1ELi3EELi3ELi1EES12_IS14_Li1ELi1EES9_IS10_ES9_IS10_ES9_IS10_ES9_IS10_Ev4Bool
                    0.00%  8.6400us         4  2.1600us  2.1120us  2.2720us  _Z27julia_broadcast_kernel_781815CuKernelContext8SubArrayI5Int64Li2E13CuDeviceArrayIS1_Li5ELi1EE5TupleI5SliceI5OneToIS1_EES1_S4_IS5_IS1_EES1_S1_ELifalseEE11BroadcastedI12CuArrayStyleILi2EES3_IS5_IS1_ES5_IS1_EE9_identityS3_I8ExtrudedIS2_IS1_Li2ELi1EES3_I4BoolS10_ES3_IS1_S1_EEEES1_
                    0.00%  8.3520us         5  1.6700us  1.6640us  1.6960us  julia_setindex_kernel_4329(CuKernelContext, CuDeviceArray<Float64, int=1, int=1>, CuDeviceArray<Float64, int=1, int=1>, Tuple<Int64>, CuDeviceArray<Float64, int=1, int=1>, UnitRange<CuDeviceArray<Float64, int=1, int=1>>)
                    0.00%  7.8720us         1  7.8720us  7.8720us  7.8720us  _Z18julia_gpu______4287ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE17_gpu______428_24116CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi3E5TupleI5OneToI5Int64ES9_IS10_ES9_IS10_EEE7NDRangeILi3ES5_S5_S7_ILi3ES8_IS9_IS10_ES9_IS10_ES9_IS10_EEES7_ILi3ES8_IS9_IS10_ES9_IS10_ES9_IS10_EEEEE13CuDeviceArrayIS8_IS10_S10_S10_S10_ELi3ELi1EES12_IS10_Li5ELi1EES9_IS10_ES9_IS10_ES9_IS10_Ev4Bool
                    0.00%  7.7120us         3  2.5700us  2.3040us  2.7200us  julia_broadcast_kernel_3985(CuKernelContext, CuDeviceArray<Float64, int=3, int=1>, Broadcasted<CuArrayStyle<int=3>, Tuple<OneTo<Int64>, CuArrayStyle<int=3, Tuple>, CuArrayStyle<int=3, Tuple>>, __, CuArrayStyle<Extruded<CuDeviceArray<Float64, int=3, int=1>, CuArrayStyle<Bool, OneTo<Int64>, OneTo<Int64>>, CuArrayStyle<Tuple, Tuple, Tuple>>, Int64<CuDeviceArray<Float64, int=3, int=1>, CuArrayStyle<OneTo<Int64>, OneTo<Int64>, OneTo<Int64>>, CuArrayStyle<Tuple, Tuple, Tuple>>>>, Tuple)
                    0.00%  7.3920us         1  7.3920us  7.3920us  7.3920us  _Z18julia_gpu______4257ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE17_gpu______425_23716CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi5E5TupleI5OneToI5Int64ES9_IS10_ES9_IS10_ES9_IS10_ES9_IS10_EEE7NDRangeILi5ES5_S5_S7_ILi5ES8_IS9_IS10_ES9_IS10_ES9_IS10_ES9_IS10_ES9_IS10_EEES7_ILi5ES8_IS9_IS10_ES9_IS10_ES9_IS10_ES9_IS10_ES9_IS10_EEEEE13CuDeviceArrayIS10_Li5ELi1EES12_IS10_Li3ELi1EE9UnitRangeIS10_ES13_IS10_ES9_IS10_ES9_IS10_ES9_IS10_Ev4Bool
                    0.00%  6.1760us         2  3.0880us  3.0720us  3.1040us  julia_getindex_kernel_7136(CuKernelContext, CuDeviceArray<Int64, int=3, int=1>, CuDeviceArray<Int64, int=5, int=1>, Tuple<Int64, Int64, Int64, Int64, Int64>, Int64, Slice<OneTo<Int64>>, Int64, CuDeviceArray<Int64<Tuple<Int64>>, int=5, int=1>, CuDeviceArray<Int64, int=1, int=1>)
                    0.00%  5.1520us         1  5.1520us  5.1520us  5.1520us  _Z27julia_broadcast_kernel_250115CuKernelContext13CuDeviceArrayI6SArrayI5TupleILi3EE7Float64Li1ELi3EELi3ELi1EE11BroadcastedI12CuArrayStyleILi3EES2_I5OneToI5Int64ES6_IS7_ES6_IS7_EE8_328_329I26SVector_S__T__where__S__T_ES2_I8ExtrudedIS0_IS3_Li3ELi1EES2_I4BoolS10_S10_ES2_IS7_S7_S7_EES9_IS0_IS3_Li3ELi1EES2_IS10_S10_S10_ES2_IS7_S7_S7_EES9_IS0_IS3_Li3ELi1EES2_IS10_S10_S10_ES2_IS7_S7_S7_EEEES7_
                    0.00%  5.0560us         1  5.0560us  5.0560us  5.0560us  _Z34julia_partial_mapreduce_grid_211289_identity4_max7Float6416CartesianIndicesILi3E5TupleI5OneToI5Int64ES4_IS5_ES4_IS5_EEES2_ILi3ES3_IS4_IS5_ES4_IS5_ES4_IS5_EEE3ValILitrueEE13CuDeviceArrayIS1_Li4ELi1EES7_IS1_Li3ELi1EE
                    0.00%  4.7680us         2  2.3840us  2.1440us  2.6240us  _Z27julia_broadcast_kernel_728815CuKernelContext8SubArrayI5Int64Li3E13CuDeviceArrayIS1_Li5ELi1EE5TupleIS1_5SliceI5OneToIS1_EES1_S4_IS5_IS1_EES2_IS1_Li1ELi1EEELifalseEE11BroadcastedI12CuArrayStyleILi3EES3_IS5_IS1_ES5_IS1_ES5_IS1_EE9_identityS3_I8ExtrudedIS2_IS1_Li3ELi1EES3_I4BoolS10_S10_ES3_IS1_S1_S1_EEEES1_
                    0.00%  3.8720us         1  3.8720us  3.8720us  3.8720us  _Z18julia_gpu______4017ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE17_gpu______401_213I5Int64E16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi2E5TupleI5OneToIS4_ES10_IS4_EEE7NDRangeILi2ES6_S6_S8_ILi2ES9_IS10_IS4_ES10_IS4_EEES8_ILi2ES9_IS10_IS4_ES10_IS4_EEEEE13CuDeviceArrayI6SArrayIS9_ILi3EE7Float64Li1ELi3EELi3ELi1EES12_IS14_Li1ELi1EES10_IS4_ES10_IS4_Ev4Bool
                    0.00%  3.0720us         1  3.0720us  3.0720us  3.0720us  julia_broadcast_kernel_3846(CuKernelContext, CuDeviceArray<Float64, int=3, int=1>, Broadcasted<CuArrayStyle<int=3>, Tuple<OneTo<Int64>, CuArrayStyle<int=3, Tuple>, CuArrayStyle<int=3, Tuple>>, __, CuArrayStyle<Extruded<CuDeviceArray<Float64, int=3, int=1>, CuArrayStyle<Bool, OneTo<Int64>, OneTo<Int64>>, CuArrayStyle<Tuple, Tuple, Tuple>>, Int64<CuDeviceArray<Float64, int=3, int=1>, CuArrayStyle<OneTo<Int64>, OneTo<Int64>, OneTo<Int64>>, CuArrayStyle<Tuple, Tuple, Tuple>>, Int64<CuDeviceArray<Float64, int=3, int=1>, CuArrayStyle<OneTo<Int64>, OneTo<Int64>, OneTo<Int64>>, CuArrayStyle<Tuple, Tuple, Tuple>>>>, Tuple)
                    0.00%  2.7840us         1  2.7840us  2.7840us  2.7840us  _Z18julia_gpu______4097ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE17_gpu______409_221I5Int64E16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi2E5TupleI5OneToIS4_ES10_IS4_EEE7NDRangeILi2ES6_S6_S8_ILi2ES9_IS10_IS4_ES10_IS4_EEES8_ILi2ES9_IS10_IS4_ES10_IS4_EEEEE13CuDeviceArrayI6SArrayIS9_ILi3EE7Float64Li1ELi3EELi3ELi1EES12_IS14_Li1ELi1EES10_IS4_ES10_IS4_Ev4Bool
                    0.00%  2.7200us         1  2.7200us  2.7200us  2.7200us  _Z18julia_gpu______4217ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE17_gpu______421_233I5Int64E16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi2E5TupleI5OneToIS4_ES10_IS4_EEE7NDRangeILi2ES6_S6_S8_ILi2ES9_IS10_IS4_ES10_IS4_EEES8_ILi2ES9_IS10_IS4_ES10_IS4_EEEEE13CuDeviceArrayI6SArrayIS9_ILi3EE7Float64Li1ELi3EELi3ELi1EES12_IS14_Li1ELi1EES10_IS4_ES10_IS4_Ev4Bool
                    0.00%  2.6880us         1  2.6880us  2.6880us  2.6880us  _Z18julia_gpu______4177ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE17_gpu______417_229I5Int64E16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi2E5TupleI5OneToIS4_ES10_IS4_EEE7NDRangeILi2ES6_S6_S8_ILi2ES9_IS10_IS4_ES10_IS4_EEES8_ILi2ES9_IS10_IS4_ES10_IS4_EEEEE13CuDeviceArrayI6SArrayIS9_ILi3EE7Float64Li1ELi3EELi3ELi1EES12_IS14_Li1ELi1EES10_IS4_ES10_IS4_Ev4Bool
                    0.00%  2.6560us         1  2.6560us  2.6560us  2.6560us  _Z18julia_gpu______4137ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE17_gpu______413_225I5Int64E16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi2E5TupleI5OneToIS4_ES10_IS4_EEE7NDRangeILi2ES6_S6_S8_ILi2ES9_IS10_IS4_ES10_IS4_EEES8_ILi2ES9_IS10_IS4_ES10_IS4_EEEEE13CuDeviceArrayI6SArrayIS9_ILi3EE7Float64Li1ELi3EELi3ELi1EES12_IS14_Li1ELi1EES10_IS4_ES10_IS4_Ev4Bool
                    0.00%  2.6240us         1  2.6240us  2.6240us  2.6240us  _Z18julia_gpu______4057ContextI14__CUDACtx_Namevv14__PassType_275v12DisableHooksE17_gpu______405_217I5Int64E16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi2E5TupleI5OneToIS4_ES10_IS4_EEE7NDRangeILi2ES6_S6_S8_ILi2ES9_IS10_IS4_ES10_IS4_EEES8_ILi2ES9_IS10_IS4_ES10_IS4_EEEEE13CuDeviceArrayI6SArrayIS9_ILi3EE7Float64Li1ELi3EELi3ELi1EES12_IS14_Li1ELi1EES10_IS4_ES10_IS4_Ev4Bool
                    0.00%  2.2720us         1  2.2720us  2.2720us  2.2720us  julia_broadcast_kernel_5370(CuKernelContext, CuDeviceArray<Float64, int=2, int=1>, Broadcasted<CuArrayStyle<int=2>, Tuple<OneTo<Int64>, CuArrayStyle<int=2, Tuple>>, __, CuArrayStyle<Extruded<CuDeviceArray<Float64, int=2, int=1>, CuArrayStyle<Bool, OneTo<Int64>>, CuArrayStyle<Tuple, Tuple>>, Int64<CuDeviceArray<Float64, int=2, int=1>, CuArrayStyle<OneTo<Int64>, OneTo<Int64>>, CuArrayStyle<Tuple, Tuple>>>>, Tuple)
                    0.00%  2.2400us         1  2.2400us  2.2400us  2.2400us  _Z27julia_broadcast_kernel_509815CuKernelContext13CuDeviceArrayI6SArrayI5TupleILi2EE7Float64Li1ELi2EELi2ELi1EE11BroadcastedI12CuArrayStyleILi2EES2_I5OneToI5Int64ES6_IS7_EE8_328_329I26SVector_S__T__where__S__T_ES2_I8ExtrudedIS0_IS3_Li2ELi1EES2_I4BoolS10_ES2_IS7_S7_EES9_IS0_IS3_Li2ELi1EES2_IS10_S10_ES2_IS7_S7_EEEES7_
      API calls:   29.53%  471.17ms     36001  13.087us  4.7700us  7.3974ms  cuLaunchKernel
                   15.65%  249.79ms         1  249.79ms  249.79ms  249.79ms  cuDevicePrimaryCtxRetain
                   12.52%  199.82ms      1521  131.38us     870ns  22.490ms
                    6.87%  109.65ms    298311     367ns     120ns  6.6143ms  cuCtxGetCurrent
                    6.75%  107.75ms       101  1.0668ms  48.200us  10.687ms  cuModuleUnload
                    5.37%  85.692ms     43867  1.9530us     360ns  6.9606ms  cuStreamQuery
                    4.83%  77.127ms     10511  7.3370us  4.3000us  8.5540ms
                    4.31%  68.798ms       101  681.17us  174.11us  19.695ms  cuModuleLoadDataEx
                    3.09%  49.358ms     37735  1.3080us     570ns  6.9325ms  cuStreamWaitEvent
                    2.76%  43.987ms     36302  1.2110us     560ns  6.7246ms  cuEventRecord
                    2.21%  35.189ms     36302     969ns     310ns  5.8710ms  cuEventCreate
                    1.88%  30.049ms     36302     827ns     270ns  6.3371ms  cuEventDestroy
                    1.50%  23.995ms      8888  2.6990us     290ns  9.0255ms  cuOccupancyMaxPotentialBlockSize
                    0.75%  11.967ms        16  747.94us  16.720us  2.1895ms  cuMemcpyDtoHAsync
                    0.73%  11.601ms     10517  1.1030us     600ns  131.63us  cuEventQuery
                    0.34%  5.4689ms        53  103.19us  4.8000us  862.50us  cuMemcpyHtoDAsync
                    0.33%  5.2588ms       101  52.067us  13.370us  1.7408ms  cuMemHostAlloc
                    0.25%  3.9332ms      1521  2.5850us     940ns  330.90us
                    0.14%  2.1875ms       202  10.829us  1.2200us  33.500us  cuCtxSynchronize
                    0.06%  964.46us         2  482.23us     220ns  964.24us
                    0.06%  923.65us        18  51.313us  2.2700us  619.07us  cuStreamCreate
                    0.02%  257.48us      1525     168ns     130ns  6.0000us  cuCtxGetDevice
                    0.01%  190.90us        69  2.7660us     310ns  18.540us  cuPointerGetAttribute
                    0.01%  175.74us        18  9.7630us  3.0700us  35.850us  cuStreamDestroy
                    0.01%  160.19us       101  1.5860us     740ns  4.2100us  cuModuleGetFunction
                    0.01%  104.41us       101  1.0330us     520ns  8.9300us  cuMemHostGetDevicePointer
                    0.01%  86.370us        16  5.3980us  2.3800us  9.9100us  cuStreamSynchronize
                    0.00%  76.790us       374     205ns     120ns  4.4700us  cuDeviceGetAttribute
                    0.00%  12.370us         6  2.0610us  1.6600us  3.1700us  cuCtxSetCurrent
                    0.00%  11.430us        14     816ns     170ns  1.6000us  cuDeviceGetCount
                    0.00%  4.2600us        11     387ns     120ns  1.3900us  cuDriverGetVersion
                    0.00%  3.2300us         1  3.2300us  3.2300us  3.2300us
                    0.00%  1.4000us         3     466ns     190ns     630ns  cuDeviceGet
======== Error: Application returned non-zero code 1
[lwilcox@compute-8-21 Atum.jl]$
[lwilcox@compute-8-21 Atum.jl]$
[lwilcox@compute-8-21 Atum.jl]$

Next, running

nvprof --gpu-trace julia-1.6 --project experiment/euler_gravity/held_suarez_deep.jl | tee perf.log

gives a timeline of kernel calls, which can be useful.

ClimateMachine 0.01 day of HS

I didn't make any changes. Running

mpirun -np 1 nvprof julia-1.5 --project heldsuarez/hs_quickcheck.jl

gives

==8644== Profiling application: julia-1.5 --project heldsuarez/hs_quickcheck.jl
==8644== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   21.63%  2.05680s      1104  1.8630ms  1.8172ms  1.9143ms  _Z30julia_gpu_band_forward_kernel_7ContextI14__CUDACtx_Name16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi2E5TupleI5OneToI5Int64ES6_IS7_EEE7NDRangeILi2ES2_10StaticSizeI6_4__4_ES4_ILi2ES5_IS6_IS7_ES6_IS7_EEEvEEv14__PassType_253v12DisableHooksE25_gpu_band_forward_kernel_13CuDeviceArrayI7Float64Li3ELi1EE20DGColumnBandedMatrixILi3E9_3__3__3_Li5ELi1350ELi15ELi1ELifalseES13_IS14_Li5ELi1EEE
                   21.12%  2.00771s      1104  1.8186ms  1.8010ms  1.8556ms  _Z27julia_gpu_band_back_kernel_7ContextI14__CUDACtx_Name16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi2E5TupleI5OneToI5Int64ES6_IS7_EEE7NDRangeILi2ES2_10StaticSizeI6_4__4_ES4_ILi2ES5_IS6_IS7_ES6_IS7_EEEvEEv14__PassType_253v12DisableHooksE22_gpu_band_back_kernel_13CuDeviceArrayI7Float64Li3ELi1EE20DGColumnBandedMatrixILi3E9_3__3__3_Li5ELi1350ELi15ELi1ELifalseES13_IS14_Li5ELi1EEE
                   14.78%  1.40509s      1656  848.48us  832.95us  857.62us  _Z35julia_gpu_dgsem_interface_tendency_7ContextI14__CUDACtx_Name16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi1E5TupleI5OneToI5Int64EEE7NDRangeILi1ES2_10StaticSizeI5_16__ES4_ILi1ES5_IS6_IS7_EEEvEEv14__PassType_253v12DisableHooksE30_gpu_dgsem_interface_tendency_51ThreeDimensionalDryCompressibleEulerWithTotalEnergyI20SphericalOrientation11DryIdealGas10NamedTupleI31__coriolis___gravity___forcing_S5_I17DeepShellCoriolis7Gravity17HeldSuarezForcingIS16_I98__k_a___k_f___k_s____T_y______z___T_equator___T_min_____b___R_d___day___grav___cp_d___cv_d___MSLP_S5_I7Float64S20_S20_S20_S20_S20_S20_S20_S20_S7_S20_S20_S20_S20_EEEEES5_I9DefaultBCS21_ES16_I14_______u____e_S5_I7_______8__u_____8__e_____EE17DryReferenceStateI26DecayingTemperatureProfileIS20_EES16_I122__a_______g_______R_d___cv_d___cp_d_______H___p____k_______T_0___T_E___T_P___b___z_t_____c_____c___V_p___day___p0___T_ref_S5_IS20_S20_S20_S20_S20_S20_S20_S20_S20_S20_S20_S20_S20_S20_S20_S20_S20_S20_S20_S20_S7_S20_S7_EEE3ValI277_dim___3__N____3__3__3___Nq____4__4__4___Nqh___16__Nqk___4__Nfp_v___16__Nfp_h___16__Np___64__nface___6__ninteriorelem___20250__nexteriorelem___0__nelem___20250__nvertelem___15__nhorzelem___1350__nrealelem___20250__nhorzrealelem___1350__device___KernelAbstractions_CUDADevice___E19HorizontalDirection11RefanovFluxv13CuDeviceArrayIS20_Li3ELi1EES30_IS20_Li3ELi1EEvvS30_IS20_Li3ELi1EES30_IS20_Li3ELi1EES30_IS20_Li4ELi1EES20_S30_IS7_Li3ELi1EES30_IS7_Li3ELi1EES30_IS7_Li2ELi1EES30_IS7_Li1ELi1EE4Bool
                   10.74%  1.02110s      1656  616.61us  594.71us  768.09us  _Z31julia_gpu_esdg_volume_tendency_7ContextI14__CUDACtx_Name16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi3E5TupleI5OneToI5Int64ES6_IS7_ES6_IS7_EEE7NDRangeILi3ES2_10StaticSizeI9_4__4__4_ES4_ILi3ES5_IS6_IS7_ES6_IS7_ES6_IS7_EEEvEEv14__PassType_253v12DisableHooksE26_gpu_esdg_volume_tendency_51ThreeDimensionalDryCompressibleEulerWithTotalEnergyI20SphericalOrientation11DryIdealGas10NamedTupleI31__coriolis___gravity___forcing_S5_I17DeepShellCoriolis7Gravity17HeldSuarezForcingIS16_I98__k_a___k_f___k_s____T_y______z___T_equator___T_min_____b___R_d___day___grav___cp_d___cv_d___MSLP_S5_I7Float64S20_S20_S20_S20_S20_S20_S20_S20_S7_S20_S20_S20_S20_EEEEES5_I9DefaultBCS21_ES16_I14_______u____e_S5_I7_______8__u_____8__e_____EE17DryReferenceStateI26DecayingTemperatureProfileIS20_EES16_I122__a_______g_______R_d___cv_d___cp_d_______H___p____k_______T_0___T_E___T_P___b___z_t_____c_____c___V_p___day___p0___T_ref_S5_IS20_S20_S20_S20_S20_S20_S20_S20_S20_S20_S20_S20_S20_S20_S20_S20_S20_S20_S20_S20_S7_S20_S7_EEE3ValILi1EES27_I277_dim___3__N____3__3__3___Nq____4__4__4___Nqh___16__Nqk___4__Nfp_v___16__Nfp_h___16__Np___64__nface___6__ninteriorelem___20250__nexteriorelem___0__nelem___20250__nvertelem___15__nhorzelem___1350__nrealelem___20250__nhorzrealelem___1350__device___KernelAbstractions_CUDADevice___E12KGVolumeFlux13CuDeviceArrayIS20_Li3ELi1EES29_IS20_Li3ELi1EES29_IS20_Li3ELi1EES29_IS20_Li3ELi1EES29_IS20_Li2ELi1EE4BoolS30_S30_
                    5.95%  565.30ms      1656  341.37us  334.62us  407.93us  _Z31julia_gpu_esdg_volume_tendency_7ContextI14__CUDACtx_Name16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi3E5TupleI5OneToI5Int64ES6_IS7_ES6_IS7_EEE7NDRangeILi3ES2_10StaticSizeI9_4__4__4_ES4_ILi3ES5_IS6_IS7_ES6_IS7_ES6_IS7_EEEvEEv14__PassType_253v12DisableHooksE26_gpu_esdg_volume_tendency_51ThreeDimensionalDryCompressibleEulerWithTotalEnergyI20SphericalOrientation11DryIdealGas10NamedTupleI31__coriolis___gravity___forcing_S5_I17DeepShellCoriolis7Gravity17HeldSuarezForcingIS16_I98__k_a___k_f___k_s____T_y______z___T_equator___T_min_____b___R_d___day___grav___cp_d___cv_d___MSLP_S5_I7Float64S20_S20_S20_S20_S20_S20_S20_S20_S7_S20_S20_S20_S20_EEEEES5_I9DefaultBCS21_ES16_I14_______u____e_S5_I7_______8__u_____8__e_____EE17DryReferenceStateI26DecayingTemperatureProfileIS20_EES16_I122__a_______g_______R_d___cv_d___cp_d_______H___p____k_______T_0___T_E___T_P___b___z_t_____c_____c___V_p___day___p0___T_ref_S5_IS20_S20_S20_S20_S20_S20_S20_S20_S20_S20_S20_S20_S20_S20_S20_S20_S20_S20_S20_S20_S7_S20_S7_EEE3ValILi3EES27_I277_dim___3__N____3__3__3___Nq____4__4__4___Nqh___16__Nqk___4__Nfp_v___16__Nfp_h___16__Np___64__nface___6__ninteriorelem___20250__nexteriorelem___0__nelem___20250__nvertelem___15__nhorzelem___1350__nrealelem___20250__nhorzrealelem___1350__device___KernelAbstractions_CUDADevice___E12KGVolumeFlux13CuDeviceArrayIS20_Li3ELi1EES29_IS20_Li3ELi1EES29_IS20_Li3ELi1EES29_IS20_Li3ELi1EES29_IS20_Li2ELi1EE4BoolS30_
                    5.89%  559.64ms      1656  337.95us  331.96us  410.43us  _Z31julia_gpu_esdg_volume_tendency_7ContextI14__CUDACtx_Name16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi3E5TupleI5OneToI5Int64ES6_IS7_ES6_IS7_EEE7NDRangeILi3ES2_10StaticSizeI9_4__4__4_ES4_ILi3ES5_IS6_IS7_ES6_IS7_ES6_IS7_EEEvEEv14__PassType_253v12DisableHooksE26_gpu_esdg_volume_tendency_51ThreeDimensionalDryCompressibleEulerWithTotalEnergyI20SphericalOrientation11DryIdealGas10NamedTupleI31__coriolis___gravity___forcing_S5_I17DeepShellCoriolis7Gravity17HeldSuarezForcingIS16_I98__k_a___k_f___k_s____T_y______z___T_equator___T_min_____b___R_d___day___grav___cp_d___cv_d___MSLP_S5_I7Float64S20_S20_S20_S20_S20_S20_S20_S20_S7_S20_S20_S20_S20_EEEEES5_I9DefaultBCS21_ES16_I14_______u____e_S5_I7_______8__u_____8__e_____EE17DryReferenceStateI26DecayingTemperatureProfileIS20_EES16_I122__a_______g_______R_d___cv_d___cp_d_______H___p____k_______T_0___T_E___T_P___b___z_t_____c_____c___V_p___day___p0___T_ref_S5_IS20_S20_S20_S20_S20_S20_S20_S20_S20_S20_S20_S20_S20_S20_S20_S20_S20_S20_S20_S20_S7_S20_S7_EEE3ValILi2EES27_I277_dim___3__N____3__3__3___Nq____4__4__4___Nqh___16__Nqk___4__Nfp_v___16__Nfp_h___16__Np___64__nface___6__ninteriorelem___20250__nexteriorelem___0__nelem___20250__nvertelem___15__nhorzelem___1350__nrealelem___20250__nhorzrealelem___1350__device___KernelAbstractions_CUDADevice___E12KGVolumeFlux13CuDeviceArrayIS20_Li3ELi1EES29_IS20_Li3ELi1EES29_IS20_Li3ELi1EES29_IS20_Li3ELi1EES29_IS20_Li2ELi1EE4BoolS30_
                    4.28%  407.27ms      1656  245.94us  239.17us  270.78us  _Z35julia_gpu_dgsem_interface_tendency_7ContextI14__CUDACtx_Name16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi1E5TupleI5OneToI5Int64EEE7NDRangeILi1ES2_10StaticSizeI5_16__ES4_ILi1ES5_IS6_IS7_EEEvEEv14__PassType_253v12DisableHooksE30_gpu_dgsem_interface_tendency_51ThreeDimensionalDryCompressibleEulerWithTotalEnergyI20SphericalOrientation11DryIdealGas10NamedTupleI31__coriolis___gravity___forcing_S5_I17DeepShellCoriolis7Gravity17HeldSuarezForcingIS16_I98__k_a___k_f___k_s____T_y______z___T_equator___T_min_____b___R_d___day___grav___cp_d___cv_d___MSLP_S5_I7Float64S20_S20_S20_S20_S20_S20_S20_S20_S7_S20_S20_S20_S20_EEEEES5_I9DefaultBCS21_ES16_I14_______u____e_S5_I7_______8__u_____8__e_____EE17DryReferenceStateI26DecayingTemperatureProfileIS20_EES16_I122__a_______g_______R_d___cv_d___cp_d_______H___p____k_______T_0___T_E___T_P___b___z_t_____c_____c___V_p___day___p0___T_ref_S5_IS20_S20_S20_S20_S20_S20_S20_S20_S20_S20_S20_S20_S20_S20_S20_S20_S20_S20_S20_S20_S7_S20_S7_EEE3ValI277_dim___3__N____3__3__3___Nq____4__4__4___Nqh___16__Nqk___4__Nfp_v___16__Nfp_h___16__Np___64__nface___6__ninteriorelem___20250__nexteriorelem___0__nelem___20250__nvertelem___15__nhorzelem___1350__nrealelem___20250__nhorzrealelem___1350__device___KernelAbstractions_CUDADevice___E17VerticalDirection11RefanovFluxv13CuDeviceArrayIS20_Li3ELi1EES30_IS20_Li3ELi1EEvvS30_IS20_Li3ELi1EES30_IS20_Li3ELi1EES30_IS20_Li4ELi1EES20_S30_IS7_Li3ELi1EES30_IS7_Li3ELi1EES30_IS7_Li2ELi1EES30_IS7_Li1ELi1EE4Bool
                    2.63%  249.74ms       552  452.43us  449.47us  455.00us  _Z23julia_gpu_stage_update_7ContextI14__CUDACtx_Name16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi1E5TupleI5OneToI5Int64EEE7NDRangeILi1ES2_10StaticSizeI6_256__ES4_ILi1ES5_IS6_IS7_EEEvEEv14__PassType_253v12DisableHooksE18_gpu_stage_update_17LowStorageVariant13CuDeviceArrayI7Float64Li3ELi1EES5_IS14_IS15_Li3ELi1EES14_IS15_Li3ELi1EES14_IS15_Li3ELi1EEES5_IS14_IS15_Li3ELi1EES14_IS15_Li3ELi1EES14_IS15_Li3ELi1EEES14_IS15_Li3ELi1EES14_IS15_Li3ELi1EE6SArrayIS5_ILi3ELi3EES15_Li2ELi9EES16_IS5_ILi3ELi3EES15_Li2ELi9EES15_3ValILi3EES17_ILifalseEEvv
                    2.33%  221.39ms      1104  200.53us  198.69us  203.58us  julia_broadcast_kernel_23986(CuKernelContext, CuDeviceArray<Float64, int=3, int=1>, Broadcasted<void, Tuple<OneTo<Int64>, Tuple<OneTo>, Tuple<OneTo>>, __, Broadcasted<Extruded<CuDeviceArray<Float64, int=3, int=1>, Broadcasted<Bool, Tuple<OneTo>, Tuple<OneTo>>, Broadcasted<OneTo, OneTo, OneTo>>, OneTo<Int64, CuDeviceArray<Float64, int=3, int=1>, Broadcasted<Tuple<OneTo>, Tuple<OneTo>, Tuple<OneTo>>, Broadcasted<OneTo, OneTo, OneTo>>>>, OneTo)
                    2.00%  190.50ms        12  15.875ms  1.9200us  40.353ms  [CUDA memcpy DtoH]
                    1.95%  185.44ms       552  335.94us  332.19us  338.43us  _Z23julia_gpu_stage_update_7ContextI14__CUDACtx_Name16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi1E5TupleI5OneToI5Int64EEE7NDRangeILi1ES2_10StaticSizeI6_256__ES4_ILi1ES5_IS6_IS7_EEEvEEv14__PassType_253v12DisableHooksE18_gpu_stage_update_17LowStorageVariant13CuDeviceArrayI7Float64Li3ELi1EES5_IS14_IS15_Li3ELi1EES14_IS15_Li3ELi1EES14_IS15_Li3ELi1EEES5_IS14_IS15_Li3ELi1EES14_IS15_Li3ELi1EES14_IS15_Li3ELi1EEES14_IS15_Li3ELi1EES14_IS15_Li3ELi1EE6SArrayIS5_ILi3ELi3EES15_Li2ELi9EES16_IS5_ILi3ELi3EES15_Li2ELi9EES15_3ValILi2EES17_ILifalseEEvv
                    1.94%  184.39ms       552  334.04us  329.85us  337.28us  _Z26julia_gpu_solution_update_7ContextI14__CUDACtx_Name16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi1E5TupleI5OneToI5Int64EEE7NDRangeILi1ES2_10StaticSizeI6_256__ES4_ILi1ES5_IS6_IS7_EEEvEEv14__PassType_253v12DisableHooksE21_gpu_solution_update_17LowStorageVariant13CuDeviceArrayI7Float64Li3ELi1EES5_IS14_IS15_Li3ELi1EES14_IS15_Li3ELi1EES14_IS15_Li3ELi1EEE6SArrayIS5_ILi3EES15_Li1ELi3EES15_3ValILi3EEvvv
                    1.58%  150.08ms      1104  135.94us  134.82us  137.31us  julia_cartesian_copy_kernelNOT__23810(CuKernelContext, CuDeviceArray<Float64, int=3, int=1>, CartesianIndex<int=3>, CuDeviceArray<Float64, int=3, int=1>, CuDeviceArray<Float64, int=3, int=1, int=3>, Tuple<Int64, CartesianIndex<int=3>, CartesianIndex<int=3>>, CartesianIndex<int=3>)
                    0.86%  81.918ms         1  81.918ms  81.918ms  81.918ms  _Z25julia_gpu_band_lu_kernel_7ContextI14__CUDACtx_Name16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi2E5TupleI5OneToI5Int64ES6_IS7_EEE7NDRangeILi2ES2_10StaticSizeI6_4__4_ES4_ILi2ES5_IS6_IS7_ES6_IS7_EEEvEEv14__PassType_253v12DisableHooksE20_gpu_band_lu_kernel_20DGColumnBandedMatrixILi3E9_3__3__3_Li5ELi1350ELi15ELi1ELifalseE13CuDeviceArrayI7Float64Li5ELi1EEE
                    0.84%  79.921ms        78  1.0246ms  1.5040us  21.619ms  [CUDA memcpy HtoD]
                    0.82%  77.656ms        60  1.2943ms  1.2708ms  1.3092ms  _Z31julia_gpu_esdg_volume_tendency_7ContextI14__CUDACtx_Name16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi3E5TupleI5OneToI5Int64ES6_IS7_ES6_IS7_EEE7NDRangeILi3ES2_10StaticSizeI9_4__4__4_ES4_ILi3ES5_IS6_IS7_ES6_IS7_ES6_IS7_EEEvEEv14__PassType_253v12DisableHooksE26_gpu_esdg_volume_tendency_61VeryLinearThreeDimensionalDryCompressibleEulerWithTotalEnergyI20SphericalOrientation11DryIdealGas10NamedTupleI11__gravity__S5_I7GravityEES5_I9DefaultBCS18_ES16_I14_______u____e_S5_I7_______8__u_____8__e_____EE17DryReferenceStateI26DecayingTemperatureProfileI7Float64EES16_I122__a_______g_______R_d___cv_d___cp_d_______H___p____k_______T_0___T_E___T_P___b___z_t_____c_____c___V_p___day___p0___T_ref_S5_IS24_S24_S24_S24_S24_S24_S24_S24_S24_S24_S24_S24_S24_S24_S24_S24_S24_S24_S24_S24_S7_S24_S7_EEE3ValILi3EES25_I277_dim___3__N____3__3__3___Nq____4__4__4___Nqh___16__Nqk___4__Nfp_v___16__Nfp_h___16__Np___64__nface___6__ninteriorelem___20250__nexteriorelem___0__nelem___20250__nvertelem___15__nhorzelem___1350__nrealelem___20250__nhorzrealelem___1350__device___KernelAbstractions_CUDADevice___E22VeryLinearKGVolumeFlux13CuDeviceArrayIS24_Li3ELi1EES27_IS24_Li3ELi1EES27_IS24_Li3ELi1EES27_IS24_Li3ELi1EES27_IS24_Li2ELi1EE4BoolS28_
                    0.40%  38.319ms        60  638.66us  628.95us  646.33us  _Z35julia_gpu_dgsem_interface_tendency_7ContextI14__CUDACtx_Name16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi1E5TupleI5OneToI5Int64EEE7NDRangeILi1ES2_10StaticSizeI5_16__ES4_ILi1ES5_IS6_IS7_EEEvEEv14__PassType_253v12DisableHooksE30_gpu_dgsem_interface_tendency_61VeryLinearThreeDimensionalDryCompressibleEulerWithTotalEnergyI20SphericalOrientation11DryIdealGas10NamedTupleI11__gravity__S5_I7GravityEES5_I9DefaultBCS18_ES16_I14_______u____e_S5_I7_______8__u_____8__e_____EE17DryReferenceStateI26DecayingTemperatureProfileI7Float64EES16_I122__a_______g_______R_d___cv_d___cp_d_______H___p____k_______T_0___T_E___T_P___b___z_t_____c_____c___V_p___day___p0___T_ref_S5_IS24_S24_S24_S24_S24_S24_S24_S24_S24_S24_S24_S24_S24_S24_S24_S24_S24_S24_S24_S24_S7_S24_S7_EEE3ValI277_dim___3__N____3__3__3___Nq____4__4__4___Nqh___16__Nqk___4__Nfp_v___16__Nfp_h___16__Np___64__nface___6__ninteriorelem___20250__nexteriorelem___0__nelem___20250__nvertelem___15__nhorzelem___1350__nrealelem___20250__nhorzrealelem___1350__device___KernelAbstractions_CUDADevice___E17VerticalDirection12RoefanovFluxv13CuDeviceArrayIS24_Li3ELi1EES28_IS24_Li3ELi1EEvvS28_IS24_Li3ELi1EES28_IS24_Li3ELi1EES28_IS24_Li4ELi1EES24_S28_IS7_Li3ELi1EES28_IS7_Li3ELi1EES28_IS7_Li2ELi1EES28_IS7_Li1ELi1EE4Bool
                    0.13%  11.994ms        60  199.90us  198.85us  201.98us  julia_broadcast_kernel_15527(CuKernelContext, CuDeviceArray<Float64, int=3, int=1>, Broadcasted<void, Tuple<OneTo<Int64>, Tuple<OneTo>, Tuple<OneTo>>, __, Broadcasted<Extruded<CuDeviceArray<Float64, int=3, int=1>, Broadcasted<Bool, Tuple<OneTo>, Tuple<OneTo>>, Broadcasted<OneTo, OneTo, OneTo>>, CuDeviceArray<Float64, int=3, int=1, CuArrayStyle<int=3>, void, Int64, Broadcasted<Float64, OneTo<Int64, CuDeviceArray<Float64, int=3, int=1>, Broadcasted<Tuple<OneTo>, Tuple<OneTo>, Tuple<OneTo>>, Broadcasted<OneTo, OneTo, OneTo>>>>>>, OneTo)
                    0.06%  6.0869ms        60  101.45us  95.935us  107.84us  _Z35julia_gpu_kernel_set_banded_matrix_7ContextI14__CUDACtx_Name16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi3E5TupleI5OneToI5Int64ES6_IS7_ES6_IS7_EEE7NDRangeILi3ES2_10StaticSizeI9_4__4__4_ES4_ILi3ES5_IS6_IS7_ES6_IS7_ES6_IS7_EEEvEEv14__PassType_253v12DisableHooksE30_gpu_kernel_set_banded_matrix_20DGColumnBandedMatrixILi3E9_3__3__3_Li5ELi1350ELi15ELi1ELifalseE13CuDeviceArrayI7Float64Li5ELi1EEES14_IS15_Li3ELi1EES7_S7_S7_9UnitRangeIS7_ES16_IS7_E
                    0.04%  3.6084ms        60  60.139us  59.552us  61.056us  _Z33julia_gpu_kernel_set_banded_data_7ContextI14__CUDACtx_Name16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi3E5TupleI5OneToI5Int64ES6_IS7_ES6_IS7_EEE7NDRangeILi3ES2_10StaticSizeI9_4__4__4_ES4_ILi3ES5_IS6_IS7_ES6_IS7_ES6_IS7_EEEvEEv14__PassType_253v12DisableHooksE28_gpu_kernel_set_banded_data_13CuDeviceArrayI7Float64Li3ELi1EE20DGColumnBandedMatrixILi3E9_3__3__3_Li5ELi1350ELi15ELi1ELifalseES13_IS14_Li5ELi1EEES7_S7_S7_9UnitRangeIS7_ES16_IS7_E
                    0.02%  2.2566ms         1  2.2566ms  2.2566ms  2.2566ms  julia_YY_4_11373(CuKernelContext, CuDeviceArray<Float64, int=5, int=1>, Float64)
                    0.00%  203.33us         1  203.33us  203.33us  203.33us  _Z44julia_gpu_kernel_nodal_init_state_auxiliary_7ContextI14__CUDACtx_Name16CompilerMetadataI10StaticSizeI10_1296000__E12DynamicCheckvv7NDRangeILi1ES2_I8_20250__ES2_I5_64__EvvEEv14__PassType_253v12DisableHooksE39_gpu_kernel_nodal_init_state_auxiliary_51ThreeDimensionalDryCompressibleEulerWithTotalEnergyI20SphericalOrientation11DryIdealGas10NamedTupleI31__coriolis___gravity___forcing_5TupleI17DeepShellCoriolis7Gravity17HeldSuarezForcingIS11_I98__k_a___k_f___k_s____T_y______z___T_equator___T_min_____b___R_d___day___grav___cp_d___cv_d___MSLP_S12_I7Float64S16_S16_S16_S16_S16_S16_S16_S16_5Int64S16_S16_S16_S16_EEEEES12_I9DefaultBCS18_ES11_I14_______u____e_S12_I7_______8__u_____8__e_____EE17DryReferenceStateI26DecayingTemperatureProfileIS16_EES11_I122__a_______g_______R_d___cv_d___cp_d_______H___p____k_______T_0___T_E___T_P___b___z_t_____c_____c___V_p___day___p0___T_ref_S12_IS16_S16_S16_S16_S16_S16_S16_S16_S16_S16_S16_S16_S16_S16_S16_S16_S16_S16_S16_S16_S17_S16_S17_EEE3ValILi3EES24_I9_3__3__3_E28_nodal_init_state_auxiliary_13CuDeviceArrayIS16_Li3ELi1EEvS24_IS11_I2__S12_EES26_IS16_Li3ELi1EE9UnitRangeIS17_E
                    0.00%  199.84us         1  199.84us  199.84us  199.84us  _Z44julia_gpu_kernel_nodal_init_state_auxiliary_7ContextI14__CUDACtx_Name16CompilerMetadataI10StaticSizeI10_1296000__E12DynamicCheckvv7NDRangeILi1ES2_I8_20250__ES2_I5_64__EvvEEv14__PassType_253v12DisableHooksE39_gpu_kernel_nodal_init_state_auxiliary_61VeryLinearThreeDimensionalDryCompressibleEulerWithTotalEnergyI20SphericalOrientation11DryIdealGas10NamedTupleI11__gravity__5TupleI7GravityEES12_I9DefaultBCS14_ES11_I14_______u____e_S12_I7_______8__u_____8__e_____EE17DryReferenceStateI26DecayingTemperatureProfileI7Float64EES11_I122__a_______g_______R_d___cv_d___cp_d_______H___p____k_______T_0___T_E___T_P___b___z_t_____c_____c___V_p___day___p0___T_ref_S12_IS20_S20_S20_S20_S20_S20_S20_S20_S20_S20_S20_S20_S20_S20_S20_S20_S20_S20_S20_S20_5Int64S20_S21_EEE3ValILi3EES22_I9_3__3__3_E28_nodal_init_state_auxiliary_13CuDeviceArrayIS20_Li3ELi1EEvS22_IS11_I2__S12_EES24_IS20_Li3ELi1EE9UnitRangeIS21_E
                    0.00%  199.68us         2  99.839us  99.615us  100.06us  _Z39julia_gpu_kernel_min_neighbor_distance_7ContextI14__CUDACtx_Name16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi1E5TupleI5OneToI5Int64EEE7NDRangeILi1ES2_10StaticSizeI5_64__ES4_ILi1ES5_IS6_IS7_EEEvEEv14__PassType_253v12DisableHooksE34_gpu_kernel_min_neighbor_distance_3ValI9_3__3__3_ES13_ILi3EE19HorizontalDirection13CuDeviceArrayI7Float64Li2ELi1EES15_IS16_Li3ELi1EE9UnitRangeIS7_E
                    0.00%  128.29us         2  64.143us  63.360us  64.927us  _Z39julia_gpu_kernel_min_neighbor_distance_7ContextI14__CUDACtx_Name16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi1E5TupleI5OneToI5Int64EEE7NDRangeILi1ES2_10StaticSizeI5_64__ES4_ILi1ES5_IS6_IS7_EEEvEEv14__PassType_253v12DisableHooksE34_gpu_kernel_min_neighbor_distance_3ValI9_3__3__3_ES13_ILi3EE17VerticalDirection13CuDeviceArrayI7Float64Li2ELi1EES15_IS16_Li3ELi1EE9UnitRangeIS7_E
                    0.00%  114.34us         4  28.583us  27.744us  29.984us  _Z33julia_partial_mapreduce_grid_81799_identity4_min7Float6416CartesianIndicesILi2E5TupleI5OneToI5Int64ES4_IS5_EEES2_ILi2ES3_IS4_IS5_ES4_IS5_EEE3ValILitrueEE13CuDeviceArrayIS1_Li3ELi1EES7_IS1_Li2ELi1EE
                    0.00%  23.007us         4  5.7510us  4.0320us  9.1840us  _Z33julia_partial_mapreduce_grid_82599_identity4_min7Float6416CartesianIndicesILi3E5TupleI5OneToI5Int64ES4_IS5_ES4_IS5_EEES2_ILi3ES3_IS4_IS5_ES4_IS5_ES4_IS5_EEE3ValILitrueEE13CuDeviceArrayIS1_Li4ELi1EES7_IS1_Li3ELi1EE
      API calls:   49.77%  1.71582s   3519781     487ns     429ns  2.6817ms  cuEventQuery
                   23.00%  793.10ms        25  31.724ms  2.1859ms  269.61ms  cuModuleLoadDataEx
                    7.71%  265.73ms         1  265.73ms  265.73ms  265.73ms  cuDevicePrimaryCtxRetain
                    5.59%  192.59ms        12  16.049ms  40.390us  40.490ms  cuMemcpyDtoH
                    4.22%  145.36ms       120  1.2114ms  2.2200us  71.555ms  cuStreamCreate
                    3.81%  131.40ms     14668  8.9580us  5.4600us  2.6418ms  cuLaunchKernel
                    2.41%  83.194ms        78  1.0666ms  5.0200us  21.670ms  cuMemcpyHtoD
                    0.85%  29.136ms        68  428.48us  2.6400us  2.2620ms  cuMemAlloc
                    0.76%  26.268ms     22074  1.1890us     360ns  259.02us  cuStreamQuery
                    0.37%  12.720ms     18096     702ns     430ns  9.3700us  cuStreamWaitEvent
                    0.30%  10.352ms     18099     571ns     380ns  1.2507ms  cuEventRecord
                    0.29%  9.8954ms     18099     546ns     320ns  275.27us  cuEventCreate
                    0.27%  9.2652ms        25  370.61us  36.949us  1.7978ms  cuModuleUnload
                    0.23%  7.9938ms         3  2.6646ms  1.9871ms  3.1016ms  cuMemFree
                    0.21%  7.0698ms     18099     390ns     210ns  787.77us  cuEventDestroy
                    0.09%  3.1354ms     18315     171ns     140ns  10.450us  cuCtxGetCurrent
                    0.07%  2.2889ms        25  91.554us  22.800us  1.0901ms  cuMemHostAlloc
                    0.04%  1.5210ms      2277     667ns     300ns  4.4500us  cuOccupancyMaxPotentialBlockSize
                    0.01%  468.08us       120  3.9000us  1.9700us  41.449us  cuStreamDestroy
                    0.00%  112.87us        25  4.5140us  3.1400us  10.360us  cuModuleGetGlobal
                    0.00%  88.749us       253     350ns     120ns  2.2800us  cuDeviceGetAttribute
                    0.00%  44.810us        25  1.7920us  1.0100us  2.9400us  cuModuleGetFunction
                    0.00%  29.850us        25  1.1940us     730ns  2.2100us  cuMemHostGetDevicePointer
                    0.00%  7.0800us        14     505ns     140ns  1.7300us  cuDeviceGetCount
                    0.00%  5.7900us        12     482ns     130ns  1.3700us  cuDeviceGet
                    0.00%  5.2300us         2  2.6150us  1.4600us  3.7700us  cuCtxSetCurrent
                    0.00%  4.3300us         7     618ns     140ns  1.3300us  cuDriverGetVersion
                    0.00%  2.9700us         2  1.4850us     680ns  2.2900us  cuCtxGetDevice

Setup

All the performance results were obtained on a V100 on node compute-8-21 of NPS's hamming computer. I grab this node with

srun --pty --time 03:00:00 --partition=math --gres=gpu:v100:1 /bin/bash

Atum+Bennu

git clone -b as/fieldarray git@github.com:sandreza/Atum.jl.git

where

cd Atum.jl
git rev-parse HEAD

gives

c407537075885b26ea06c3b1e7d4be2b95f59621

For Atum I am using Julia v1.6.6. The command

julia-1.6 -e 'using InteractiveUtils; versioninfo()'

gives

Julia Version 1.6.6
Commit b8708f954a (2022-03-28 07:17 UTC)
Platform Info:
  OS: Linux (x86_64-pc-linux-gnu)
  CPU: AMD EPYC 7282 16-Core Processor
  WORD_SIZE: 64
  LIBM: libopenlibm
  LLVM: libLLVM-11.0.1 (ORCJIT, znver2)
Environment:
  JULIA_MPI_BINARY = system

Next we instantiate the environment

julia-1.6 --project=. -e 'using Pkg; Pkg.instantiate()'

ClimateMachine

git clone -b fdg_hs git@github.com:CliMA/ClimaAtmos.jl.git

where

cd ClimaAtmos.jl
git rev-parse HEAD

gives

bf5b28dd36f6b21819799e90d2448b9d1ec69fdc

For ClimateMachine we are using Julia v1.5.4. The command

julia-1.5 -e 'using InteractiveUtils; versioninfo()'

gives

Julia Version 1.5.4
Commit 69fcb5745b (2021-03-11 19:13 UTC)
Platform Info:
  OS: Linux (x86_64-pc-linux-gnu)
  CPU: AMD EPYC 7282 16-Core Processor
  WORD_SIZE: 64
  LIBM: libopenlibm
  LLVM: libLLVM-9.0.1 (ORCJIT, znver2)
Environment:
  JULIA_MPI_BINARY = system

Next we instantiate the environment

julia-1.5 --project=. -e 'using Pkg; Pkg.instantiate()'

Getting output

As long as I start a tmux session on hamming I can capture the text in a tmux pane with the command

tmux capture-pane -t 0.0 -J -pS -1000000 | sed -E 's# +$##g' > file.out

Tab completion can be used after -t to figure out which pane to grab.

# This file is machine-generated - editing it directly is not advised
[[AbstractFFTs]]
deps = ["ChainRulesCore", "LinearAlgebra"]
git-tree-sha1 = "6f1d9bc1c08f9f4a8fa92e3ea3cb50153a1b40d4"
uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c"
version = "1.1.0"
[[Adapt]]
deps = ["LinearAlgebra"]
git-tree-sha1 = "af92965fb30777147966f58acb05da51c5616b5f"
uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
version = "3.3.3"
[[ArgTools]]
uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f"
[[ArrayInterface]]
deps = ["Compat", "IfElse", "LinearAlgebra", "Requires", "SparseArrays", "Static"]
git-tree-sha1 = "1d6835607e9f214cb4210310868f8cf07eb0facc"
uuid = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
version = "3.1.34"
[[Artifacts]]
uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33"
[[BFloat16s]]
deps = ["LinearAlgebra", "Printf", "Random", "Test"]
git-tree-sha1 = "a598ecb0d717092b5539dbbe890c98bac842b072"
uuid = "ab4f0b2a-ad5b-11e8-123f-65d77653426b"
version = "0.2.0"
[[Base64]]
uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"
[[Bennu]]
deps = ["Adapt", "ArrayInterface", "CUDA", "CUDAKernels", "ExprTools", "FillArrays", "GPUArrays", "KernelAbstractions", "LinearAlgebra", "LoopVectorization", "SparseArrays", "StaticArrays", "StructArrays", "Tullio", "WriteVTK"]
path = "../../Bennu.jl"
uuid = "a1f7094c-8d14-434f-a772-7144e1a1c6c0"
version = "0.3.1"
[[BitTwiddlingConvenienceFunctions]]
deps = ["Static"]
git-tree-sha1 = "28bbdbf0354959db89358d1d79d421ff31ef0b5e"
uuid = "62783981-4cbd-42fc-bca8-16325de8dc4b"
version = "0.1.3"
[[CEnum]]
git-tree-sha1 = "215a9aa4a1f23fbd05b92769fdd62559488d70e9"
uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82"
version = "0.4.1"
[[CPUSummary]]
deps = ["CpuId", "IfElse", "Static"]
git-tree-sha1 = "913b28a04929053e4310d0a4915f1efe195c0ce6"
uuid = "2a0fbf3d-bb9c-48f3-b0a9-814d99fd7ab9"
version = "0.1.19"
[[CUDA]]
deps = ["AbstractFFTs", "Adapt", "BFloat16s", "CEnum", "CompilerSupportLibraries_jll", "ExprTools", "GPUArrays", "GPUCompiler", "LLVM", "LazyArtifacts", "Libdl", "LinearAlgebra", "Logging", "Printf", "Random", "Random123", "RandomNumbers", "Reexport", "Requires", "SparseArrays", "SpecialFunctions", "TimerOutputs"]
git-tree-sha1 = "ba75320aaa092b3e17c020a2d8b9e0a572dbfa6a"
uuid = "052768ef-5323-5732-b1bb-66c8b64840ba"
version = "3.9.0"
[[CUDAKernels]]
deps = ["Adapt", "CUDA", "Cassette", "KernelAbstractions", "SpecialFunctions", "StaticArrays"]
git-tree-sha1 = "f35bc730e2b0cc4400835d32cd90b7ebb1ad6e81"
uuid = "72cfdca4-0801-4ab0-bf6a-d52aa10adc57"
version = "0.3.3"
[[Cassette]]
git-tree-sha1 = "063b2e77c5537a548c5bf2f44161f1d3e1ab3227"
uuid = "7057c7e9-c182-5462-911a-8362d720325c"
version = "0.3.10"
[[ChainRulesCore]]
deps = ["Compat", "LinearAlgebra", "SparseArrays"]
git-tree-sha1 = "9950387274246d08af38f6eef8cb5480862a435f"
uuid = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
version = "1.14.0"
[[ChangesOfVariables]]
deps = ["ChainRulesCore", "LinearAlgebra", "Test"]
git-tree-sha1 = "bf98fa45a0a4cee295de98d4c1462be26345b9a1"
uuid = "9e997f8a-9a97-42d5-a9f1-ce6bfc15e2c0"
version = "0.1.2"
[[CloseOpenIntervals]]
deps = ["ArrayInterface", "Static"]
git-tree-sha1 = "f576084239e6bdf801007c80e27e2cc2cd963fe0"
uuid = "fb6a15b2-703c-40df-9091-08a04967cfa9"
version = "0.1.6"
[[CodecZlib]]
deps = ["TranscodingStreams", "Zlib_jll"]
git-tree-sha1 = "ded953804d019afa9a3f98981d99b33e3db7b6da"
uuid = "944b1d66-785c-5afd-91f1-9de20f533193"
version = "0.7.0"
[[Compat]]
deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "SHA", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"]
git-tree-sha1 = "b153278a25dd42c65abbf4e62344f9d22e59191b"
uuid = "34da2185-b29b-5c13-b0c7-acf172513d20"
version = "3.43.0"
[[CompilerSupportLibraries_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae"
[[CpuId]]
deps = ["Markdown"]
git-tree-sha1 = "32d125af0fb8ec3f8935896122c5e345709909e5"
uuid = "adafc99b-e345-5852-983c-f28acb93d879"
version = "0.3.0"
[[DataAPI]]
git-tree-sha1 = "cc70b17275652eb47bc9e5f81635981f13cea5c8"
uuid = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a"
version = "1.9.0"
[[DataValueInterfaces]]
git-tree-sha1 = "bfc1187b79289637fa0ef6d4436ebdfe6905cbd6"
uuid = "e2d170a0-9d28-54be-80f0-106bbe20a464"
version = "1.0.0"
[[Dates]]
deps = ["Printf"]
uuid = "ade2ca70-3891-5945-98fb-dc099432e06a"
[[DelimitedFiles]]
deps = ["Mmap"]
uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab"
[[DiffRules]]
deps = ["IrrationalConstants", "LogExpFunctions", "NaNMath", "Random", "SpecialFunctions"]
git-tree-sha1 = "dd933c4ef7b4c270aacd4eb88fa64c147492acf0"
uuid = "b552c78f-8df3-52c6-915a-8e097449b14b"
version = "1.10.0"
[[Distributed]]
deps = ["Random", "Serialization", "Sockets"]
uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b"
[[DocStringExtensions]]
deps = ["LibGit2"]
git-tree-sha1 = "b19534d1895d702889b219c382a6e18010797f0b"
uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae"
version = "0.8.6"
[[Downloads]]
deps = ["ArgTools", "LibCURL", "NetworkOptions"]
uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6"
[[ExprTools]]
git-tree-sha1 = "56559bbef6ca5ea0c0818fa5c90320398a6fbf8d"
uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04"
version = "0.1.8"
[[FillArrays]]
deps = ["LinearAlgebra", "Random", "SparseArrays", "Statistics"]
git-tree-sha1 = "deed294cde3de20ae0b2e0355a6c4e1c6a5ceffc"
uuid = "1a297f60-69ca-5386-bcde-b61e274b549b"
version = "0.12.8"
[[GPUArrays]]
deps = ["Adapt", "LLVM", "LinearAlgebra", "Printf", "Random", "Serialization", "Statistics"]
git-tree-sha1 = "c783e8883028bf26fb05ed4022c450ef44edd875"
uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
version = "8.3.2"
[[GPUCompiler]]
deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "TimerOutputs", "UUIDs"]
git-tree-sha1 = "556190e1e0ea3e37d83059fc9aa576f1e2104375"
uuid = "61eb1bfa-7361-4325-ad38-22787b887f55"
version = "0.14.1"
[[HostCPUFeatures]]
deps = ["BitTwiddlingConvenienceFunctions", "IfElse", "Libdl", "Static"]
git-tree-sha1 = "18be5268cf415b5e27f34980ed25a7d34261aa83"
uuid = "3e5b6fbb-0976-4d2c-9146-d79de83f2fb0"
version = "0.1.7"
[[Hwloc]]
deps = ["Hwloc_jll"]
git-tree-sha1 = "92d99146066c5c6888d5a3abc871e6a214388b91"
uuid = "0e44f5e4-bd66-52a0-8798-143a42290a1d"
version = "2.0.0"
[[Hwloc_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "303d70c961317c4c20fafaf5dbe0e6d610c38542"
uuid = "e33a78d0-f292-5ffc-b300-72abe9b543c8"
version = "2.7.1+0"
[[IfElse]]
git-tree-sha1 = "debdd00ffef04665ccbb3e150747a77560e8fad1"
uuid = "615f187c-cbe4-4ef1-ba3b-2fcf58d6d173"
version = "0.1.1"
[[InteractiveUtils]]
deps = ["Markdown"]
uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
[[InverseFunctions]]
deps = ["Test"]
git-tree-sha1 = "91b5dcf362c5add98049e6c29ee756910b03051d"
uuid = "3587e190-3f89-42d0-90ee-14403ec27112"
version = "0.1.3"
[[IrrationalConstants]]
git-tree-sha1 = "7fd44fd4ff43fc60815f8e764c0f352b83c49151"
uuid = "92d709cd-6900-40b7-9082-c6be49f344b6"
version = "0.1.1"
[[IteratorInterfaceExtensions]]
git-tree-sha1 = "a3f24677c21f5bbe9d2a714f95dcd58337fb2856"
uuid = "82899510-4779-5014-852e-03e436cf321d"
version = "1.0.0"
[[JLLWrappers]]
deps = ["Preferences"]
git-tree-sha1 = "abc9885a7ca2052a736a600f7fa66209f96506e1"
uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210"
version = "1.4.1"
[[KernelAbstractions]]
deps = ["Adapt", "Cassette", "InteractiveUtils", "MacroTools", "SpecialFunctions", "StaticArrays", "UUIDs"]
git-tree-sha1 = "cb7d8b805413025a5bc866fc036b426223ffc059"
uuid = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
version = "0.7.2"
[[LLVM]]
deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Printf", "Unicode"]
git-tree-sha1 = "c9b86064be5ae0f63e50816a5a90b08c474507ae"
uuid = "929cbde3-209d-540e-8aea-75f648917ca0"
version = "4.9.1"
[[LLVMExtra_jll]]
deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "Pkg"]
git-tree-sha1 = "5558ad3c8972d602451efe9d81c78ec14ef4f5ef"
uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab"
version = "0.0.14+2"
[[LayoutPointers]]
deps = ["ArrayInterface", "LinearAlgebra", "ManualMemory", "SIMDTypes", "Static"]
git-tree-sha1 = "b651f573812d6c36c22c944dd66ef3ab2283dfa1"
uuid = "10f19ff3-798f-405d-979b-55457f8fc047"
version = "0.1.6"
[[LazyArtifacts]]
deps = ["Artifacts", "Pkg"]
uuid = "4af54fe1-eca0-43a8-85a7-787d91b784e3"
[[LibCURL]]
deps = ["LibCURL_jll", "MozillaCACerts_jll"]
uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21"
[[LibCURL_jll]]
deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"]
uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0"
[[LibGit2]]
deps = ["Base64", "NetworkOptions", "Printf", "SHA"]
uuid = "76f85450-5226-5b5a-8eaa-529ad045b433"
[[LibSSH2_jll]]
deps = ["Artifacts", "Libdl", "MbedTLS_jll"]
uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8"
[[Libdl]]
uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
[[Libiconv_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "42b62845d70a619f063a7da093d995ec8e15e778"
uuid = "94ce4f54-9a6c-5748-9c1c-f9c7231a4531"
version = "1.16.1+1"
[[LightXML]]
deps = ["Libdl", "XML2_jll"]
git-tree-sha1 = "e129d9391168c677cd4800f5c0abb1ed8cb3794f"
uuid = "9c8b4983-aa76-5018-a973-4c85ecc9e179"
version = "0.9.0"
[[LinearAlgebra]]
deps = ["Libdl"]
uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
[[LogExpFunctions]]
deps = ["ChainRulesCore", "ChangesOfVariables", "DocStringExtensions", "InverseFunctions", "IrrationalConstants", "LinearAlgebra"]
git-tree-sha1 = "a970d55c2ad8084ca317a4658ba6ce99b7523571"
uuid = "2ab3a3ac-af41-5b50-aa03-7779005ae688"
version = "0.3.12"
[[Logging]]
uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"
[[LoopVectorization]]
deps = ["ArrayInterface", "CPUSummary", "CloseOpenIntervals", "DocStringExtensions", "HostCPUFeatures", "IfElse", "LayoutPointers", "LinearAlgebra", "OffsetArrays", "PolyesterWeave", "Requires", "SLEEFPirates", "Static", "ThreadingUtilities", "UnPack", "VectorizationBase"]
git-tree-sha1 = "c2c1a765d943267ffc01fd6a127fcb482e80f63a"
uuid = "bdcacae8-1622-11e9-2a5c-532679323890"
version = "0.12.82"
[[MacroTools]]
deps = ["Markdown", "Random"]
git-tree-sha1 = "3d3e902b31198a27340d0bf00d6ac452866021cf"
uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
version = "0.5.9"
[[ManualMemory]]
git-tree-sha1 = "bcaef4fc7a0cfe2cba636d84cda54b5e4e4ca3cd"
uuid = "d125e4d3-2237-4719-b19c-fa641b8a4667"
version = "0.1.8"
[[Markdown]]
deps = ["Base64"]
uuid = "d6f4376e-aef5-505a-96c1-9c027394607a"
[[MbedTLS_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1"
[[Mmap]]
uuid = "a63ad114-7e13-5084-954f-fe012c677804"
[[MozillaCACerts_jll]]
uuid = "14a3606d-f60d-562e-9121-12d972cd8159"
[[NaNMath]]
git-tree-sha1 = "737a5957f387b17e74d4ad2f440eb330b39a62c5"
uuid = "77ba4419-2d1f-58cd-9bb1-8ffee604a2e3"
version = "1.0.0"
[[NetworkOptions]]
uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908"
[[OffsetArrays]]
deps = ["Adapt"]
git-tree-sha1 = "043017e0bdeff61cfbb7afeb558ab29536bbb5ed"
uuid = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
version = "1.10.8"
[[OpenLibm_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "05823500-19ac-5b8b-9628-191a04bc5112"
[[OpenSpecFun_jll]]
deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "13652491f6856acfd2db29360e1bbcd4565d04f1"
uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e"
version = "0.5.5+0"
[[OrderedCollections]]
git-tree-sha1 = "85f8e6578bf1f9ee0d11e7bb1b1456435479d47c"
uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
version = "1.4.1"
[[Pkg]]
deps = ["Artifacts", "Dates", "Downloads", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"]
uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
[[PolyesterWeave]]
deps = ["BitTwiddlingConvenienceFunctions", "CPUSummary", "IfElse", "Static", "ThreadingUtilities"]
git-tree-sha1 = "7e597df97e46ffb1c8adbaddfa56908a7a20194b"
uuid = "1d0040c9-8b98-4ee7-8388-3f51789ca0ad"
version = "0.1.5"
[[Preferences]]
deps = ["TOML"]
git-tree-sha1 = "d3538e7f8a790dc8903519090857ef8e1283eecd"
uuid = "21216c6a-2e73-6563-6e65-726566657250"
version = "1.2.5"
[[Printf]]
deps = ["Unicode"]
uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7"
[[REPL]]
deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"]
uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb"
[[Random]]
deps = ["Serialization"]
uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
[[Random123]]
deps = ["Random", "RandomNumbers"]
git-tree-sha1 = "afeacaecf4ed1649555a19cb2cad3c141bbc9474"
uuid = "74087812-796a-5b5d-8853-05524746bad3"
version = "1.5.0"
[[RandomNumbers]]
deps = ["Random", "Requires"]
git-tree-sha1 = "043da614cc7e95c703498a491e2c21f58a2b8111"
uuid = "e6cf234a-135c-5ec9-84dd-332b85af5143"
version = "1.5.3"
[[Reexport]]
git-tree-sha1 = "45e428421666073eab6f2da5c9d310d99bb12f9b"
uuid = "189a3867-3050-52da-a836-e630ba90ab69"
version = "1.2.2"
[[Requires]]
deps = ["UUIDs"]
git-tree-sha1 = "838a3a4188e2ded87a4f9f184b4b0d78a1e91cb7"
uuid = "ae029012-a4dd-5104-9daa-d747884805df"
version = "1.3.0"
[[SHA]]
uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"
[[SIMDTypes]]
git-tree-sha1 = "330289636fb8107c5f32088d2741e9fd7a061a5c"
uuid = "94e857df-77ce-4151-89e5-788b33177be4"
version = "0.1.0"
[[SLEEFPirates]]
deps = ["IfElse", "Static", "VectorizationBase"]
git-tree-sha1 = "ac399b5b163b9140f9c310dfe9e9aaa225617ff6"
uuid = "476501e8-09a2-5ece-8869-fb82de89a1fa"
version = "0.6.32"
[[Serialization]]
uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
[[SharedArrays]]
deps = ["Distributed", "Mmap", "Random", "Serialization"]
uuid = "1a1011a3-84de-559e-8e89-a11a2f7dc383"
[[Sockets]]
uuid = "6462fe0b-24de-5631-8697-dd941f90decc"
[[SparseArrays]]
deps = ["LinearAlgebra", "Random"]
uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
[[SpecialFunctions]]
deps = ["ChainRulesCore", "IrrationalConstants", "LogExpFunctions", "OpenLibm_jll", "OpenSpecFun_jll"]
git-tree-sha1 = "5ba658aeecaaf96923dce0da9e703bd1fe7666f9"
uuid = "276daf66-3868-5448-9aa4-cd146d93841b"
version = "2.1.4"
[[Static]]
deps = ["IfElse"]
git-tree-sha1 = "a8f30abc7c64a39d389680b74e749cf33f872a70"
uuid = "aedffcd0-7271-4cad-89d0-dc628f76c6d3"
version = "0.3.3"
[[StaticArrays]]
deps = ["LinearAlgebra", "Random", "Statistics"]
git-tree-sha1 = "4f6ec5d99a28e1a749559ef7dd518663c5eca3d5"
uuid = "90137ffa-7385-5640-81b9-e52037218182"
version = "1.4.3"
[[Statistics]]
deps = ["LinearAlgebra", "SparseArrays"]
uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
[[StructArrays]]
deps = ["Adapt", "DataAPI", "StaticArrays", "Tables"]
git-tree-sha1 = "d21f2c564b21a202f4677c0fba5b5ee431058544"
uuid = "09ab397b-f2b6-538f-b94a-2f83cf4a842a"
version = "0.6.4"
[[TOML]]
deps = ["Dates"]
uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76"
[[TableTraits]]
deps = ["IteratorInterfaceExtensions"]
git-tree-sha1 = "c06b2f539df1c6efa794486abfb6ed2022561a39"
uuid = "3783bdb8-4a98-5b6b-af9a-565f29a5fe9c"
version = "1.0.1"
[[Tables]]
deps = ["DataAPI", "DataValueInterfaces", "IteratorInterfaceExtensions", "LinearAlgebra", "OrderedCollections", "TableTraits", "Test"]
git-tree-sha1 = "5ce79ce186cc678bbb5c5681ca3379d1ddae11a1"
uuid = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
version = "1.7.0"
[[Tar]]
deps = ["ArgTools", "SHA"]
uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e"
[[Test]]
deps = ["InteractiveUtils", "Logging", "Random", "Serialization"]
uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
[[ThreadingUtilities]]
deps = ["ManualMemory"]
git-tree-sha1 = "884539ba8c4584a3a8173cb4ee7b61049955b79c"
uuid = "8290d209-cae3-49c0-8002-c8c24d57dab5"
version = "0.4.7"
[[TimerOutputs]]
deps = ["ExprTools", "Printf"]
git-tree-sha1 = "d60b0c96a16aaa42138d5d38ad386df672cb8bd8"
uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
version = "0.5.16"
[[TranscodingStreams]]
deps = ["Random", "Test"]
git-tree-sha1 = "216b95ea110b5972db65aa90f88d8d89dcb8851c"
uuid = "3bb67fe8-82b1-5028-8e26-92a6c54297fa"
version = "0.9.6"
[[Tullio]]
deps = ["ChainRulesCore", "DiffRules", "LinearAlgebra", "Requires"]
git-tree-sha1 = "859e2e9a7222553a0c052e423557cedb49376da9"
uuid = "bc48ee85-29a4-5162-ae0b-a64e1601d4bc"
version = "0.3.4"
[[UUIDs]]
deps = ["Random", "SHA"]
uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
[[UnPack]]
git-tree-sha1 = "387c1f73762231e86e0c9c5443ce3b4a0a9a0c2b"
uuid = "3a884ed6-31ef-47d7-9d2a-63182c4928ed"
version = "1.0.2"
[[Unicode]]
uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
[[VectorizationBase]]
deps = ["ArrayInterface", "CPUSummary", "HostCPUFeatures", "Hwloc", "IfElse", "LayoutPointers", "Libdl", "LinearAlgebra", "SIMDTypes", "Static"]
git-tree-sha1 = "9d1b533f597d87ce9b4abd36a2ce4664f08e08ed"
uuid = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"
version = "0.21.29"
[[WriteVTK]]
deps = ["Base64", "CodecZlib", "FillArrays", "LightXML", "TranscodingStreams"]
git-tree-sha1 = "bff2f6b5ff1e60d89ae2deba51500ce80014f8f6"
uuid = "64499a7a-5c06-52f2-abe2-ccb03c286192"
version = "1.14.2"
[[XML2_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Libiconv_jll", "Pkg", "Zlib_jll"]
git-tree-sha1 = "1acf5bdf07aa0907e0a37d3718bb88d4b687b74a"
uuid = "02c8fc9c-b97f-50b9-bbe4-9be30ff0a78a"
version = "2.9.12+0"
[[Zlib_jll]]
deps = ["Libdl"]
uuid = "83775a58-1f1d-513f-b197-d71354ab007a"
[[nghttp2_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d"
[[p7zip_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0"
#
# Benchmark script for profiling the banded matrix operations
#
using Bennu
using Random
using StructArrays
using LinearAlgebra
using StaticArrays
using KernelAbstractions
using CUDA
# Benchmark driver: builds a batch of random banded linear systems on the
# host, factors them on the device with `batchedbandedlu!`, solves them with
# `ldiv!` through Bennu field arrays, checks the result against the known
# solution, and finally builds a banded matrix with
# `Bennu.batchedbandedmatrix`. Everything lives in a `let` block so that no
# globals are created; the block evaluates to `nothing`.
let
# Problem sizes.
# NOTE(review): the names suggest fields per node, points per element
# direction, and horizontal/vertical element counts -- confirm against Bennu.
Nfields = 5
Nq = (4, 4, 4)
# Product of all but the last entry of Nq (16), and the last entry (4)
Nqh = prod(Nq[1:end-1])
Nqv = Nq[end]
Neh = 1350
# Smaller alternative size kept by the author
#Neh = 130
Nev = 15
# Lower and upper bandwidths (both 20) and the size of each system (300)
kl = Nqv * Nfields
ku = Nqv * Nfields
n = Nfields * Nev * Nqv
# Element type and array type; swap in the commented line for a host-only run
# T, AT = (Float64, Array)
T, AT = (Float64, CuArray)
# Fixed seed so the generated systems are reproducible
rng = MersenneTwister(777)
# Number of stored diagonals per matrix
width = ku + kl + 1
# Banded matrices
# Storage layout (see the fill loop below): entry (i, j) of the matrix for
# batch indices (ij, eh) lives at [ij, i - j + ku + 1, j, eh].
h_A = zeros(T, Nqh, width, n, Neh)
# Banded matrix factors
h_D = zeros(T, Nqh, width, n, Neh)
# RHS and solution vector
h_x = zeros(T, Nqh, n, Neh)
h_b = zeros(T, Nqh, n, Neh)
# Loop through and set columns
@inbounds for eh = 1:Neh, ij = 1:Nqh
# Create some random factors
# U gets diagonals 0..ku; the k = 1 pair also targets diagonal 0, and diagm
# sums pairs that share a diagonal, so the diagonal of U is 1 + rand.
# L is unit lower triangular with kl subdiagonals. Diagonal k is scaled by 1/k.
U = diagm(0=>ones(T, n), ntuple(k -> k-1=>rand(rng, T, n+1-k) / k, ku + 1)...)
L = diagm(0=>ones(T, n), ntuple(k -> -k=>rand(rng, T, n-k) / k, kl)...)
# C is the matrix whose factors are L and U by construction; D packs both
# factors into one matrix with the unit diagonal of L removed.
C = L * U
D = L + U - I
# Store matrices in banded form
# k > 0 selects subdiagonal k, k < 0 selects superdiagonal -k; the third
# index range is the set of columns on which that diagonal has entries.
for k = -ku:kl
h_A[ij, k + ku + 1, max(1, 1-k):min(n-k, n), eh] = diag(C, -k)
h_D[ij, k + ku + 1, max(1, 1-k):min(n-k, n), eh] = diag(D, -k)
end
# Create column solution and RHS vectors
# (random exact solution x, and b = C * x so the solve can be checked later)
h_x[ij, :, eh] = rand(rng, T, n)
h_b[ij, :, eh] = C * h_x[ij, :, eh]
end
@info "A and LU on host"
# NOTE(review): presumably the factorization used here needs equal lower and
# upper bandwidths -- confirm against batchedbandedlu!.
@assert kl == ku
# Move the banded matrices to the device and drop the host reference
d_A = AT(h_A)
h_A = nothing
# Factor on the device, then drop this name for the input array
d_LU = batchedbandedlu!(d_A)
d_A = nothing
# The device factors must match the host-side packed factors D
@assert Array(parent(d_LU)) ≈ h_D
h_D = nothing
# Repeated collections after dropping the references above;
# NOTE(review): presumably to make sure the large host/device buffers are
# actually released before the next allocations -- confirm this is needed.
GC.gc()
GC.gc()
GC.gc()
GC.gc()
@info "A and LU on device"
# Right-hand side on the device, plus two arrays of the same shape.
# d_x and d_y are created with `similar` and are not filled anywhere in this
# script before being passed to batchedbandedmatrix below.
d_b = AT(h_b)
d_x = similar(d_b)
d_y = similar(d_b)
# make sure this all works with fieldarray types too
fld_b = fieldarray(undef, SVector{Nfields, T}, AT, (Nq..., Nev * Neh))
# get_batched_array returns a ReshapedArray, for which the copy! functions
# don't work on the GPU, but its parent is a PermutedDimsArray, for which
# they work fine, so the data is copied into the parent instead.
copyto!(parent(Bennu.get_batched_array(fld_b, Nqh, Neh)), 1, d_b, 1)
fld_x = fieldarray(undef, SVector{Nfields, T}, AT, (Nq..., Nev * Neh))
# Solve all batched systems: fld_x receives the solution for RHS fld_b
ldiv!(fld_x, d_LU, fld_b)
# Drop the factorization and collect again (same pattern as above)
d_LU = nothing
GC.gc()
GC.gc()
GC.gc()
GC.gc()
@info "ldiv! done"
# As above, copy through the parent PermutedDimsArray of the ReshapedArray
# returned by get_batched_array, this time from the device into a host Array.
fld_xa = copyto!(Array{T}(undef, Nqh, n, Neh), 1, parent(Bennu.get_batched_array(fld_x, Nqh, Neh)), 1)
# Compare with the known solution; h_x was allocated with size
# (Nqh, n, Neh), so the reshape does not change its shape.
@assert fld_xa ≈ reshape(h_x, Nqh, n, Neh)
@info "ldiv! checked"
# Operator handed to batchedbandedmatrix: copies x into y.
# If `in_event` is an Event it is waited on and a new Event is returned;
# otherwise a fresh Event is created and waited on, and y itself is returned.
function rhs!(y, x, in_event)
event = in_event isa Event ? in_event : Event(Bennu.device(y))
wait(event)
y .= x
event = Event(Bennu.device(y))
return in_event isa Event ? event : y
end
# Build a banded matrix from rhs!; the result is bound to d_A but not used
# afterwards in this script.
# NOTE(review): the meaning of the last argument, Nqh+max(ku, kl), is not
# visible here -- confirm against Bennu.batchedbandedmatrix.
d_A = Bennu.batchedbandedmatrix(rhs!, d_y, d_x, kl, ku, Nqh+max(ku, kl))
@info "Built banded"
# Make the let block evaluate to nothing
nothing
end
[deps]
Bennu = "a1f7094c-8d14-434f-a772-7144e1a1c6c0"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
StructArrays = "09ab397b-f2b6-538f-b94a-2f83cf4a842a"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment