Created
June 10, 2025 22:00
-
-
Save shunting314/3e86692884f446bb8714d6b8c83e4079 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"schemaVersion": 1, | |
"deviceProperties": [ | |
{ | |
"id": 0, "name": "NVIDIA H100", "totalGlobalMem": 102010781696, | |
"computeMajor": 9, "computeMinor": 0, | |
"maxThreadsPerBlock": 1024, "maxThreadsPerMultiprocessor": 2048, | |
"regsPerBlock": 65536, "warpSize": 32, | |
"sharedMemPerBlock": 49152, "numSms": 132 | |
, "regsPerMultiprocessor": 65536, "sharedMemPerBlockOptin": 232448, "sharedMemPerMultiprocessor": 233472 | |
} | |
], | |
"cupti_version": 24, | |
"cuda_runtime_version": 12060, | |
"cuda_driver_version": 12020, | |
"trace_id": "B34F20FE8F5E46E7896FC32BAFC99ABC", | |
"displayTimeUnit": "ms", | |
"baseTimeNanoseconds": 1743521598000000000, | |
"traceEvents": [ | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193020828.082, "dur": 546.544, | |
"args": { | |
"External id": 513,"Record function id": 0, "Sequence number": 134, "Fwd thread id": 1, "Ev Idx": 0 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193020842.103, "dur": 499.112, | |
"args": { | |
"External id": 514,"Record function id": 0, "Sequence number": 134, "Fwd thread id": 1, "Ev Idx": 1 | |
} | |
}, | |
{ | |
"ph": "f", "id": 1, "pid": 2337800, "tid": 2340515, "ts": 6071193020842.103, | |
"cat": "fwdbwd", "name": "fwdbwd", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_0", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193021150.788, "dur": 76.175, | |
"args": { | |
"External id": 515,"Record function id": 0, "Ev Idx": 2 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_1", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193021242.066, "dur": 13.891, | |
"args": { | |
"External id": 516,"Record function id": 0, "Ev Idx": 3 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_mul_2", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193021284.670, "dur": 25.589, | |
"args": { | |
"External id": 517,"Record function id": 0, "Ev Idx": 4 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193021395.046, "dur": 27.652, | |
"args": { | |
"External id": 518,"Record function id": 0, "Ev Idx": 5 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193021398.151, "dur": 22.033, | |
"args": { | |
"External id": 519,"Record function id": 0, "Ev Idx": 6 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193021407.796, "dur": 10.846, | |
"args": { | |
"External id": 520,"Record function id": 0, "Ev Idx": 7 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "detach", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193021410.149, "dur": 8.333, | |
"args": { | |
"External id": 521,"Record function id": 0, "Ev Idx": 8 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193021427.676, "dur": 4.216, | |
"args": { | |
"External id": 522,"Record function id": 0, "Ev Idx": 9 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193021429.148, "dur": 1.973, | |
"args": { | |
"External id": 523,"Record function id": 0, "Ev Idx": 10 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193021429.809, "dur": 0.961, | |
"args": { | |
"External id": 524,"Record function id": 0, "Ev Idx": 11 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "detach", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193021430.039, "dur": 0.611, | |
"args": { | |
"External id": 525,"Record function id": 0, "Ev Idx": 12 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ToCopyBackward0", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193021436.669, "dur": 80152.041, | |
"args": { | |
"External id": 526,"Record function id": 0, "Ev Idx": 13 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193021441.957, "dur": 86.551, | |
"args": { | |
"External id": 527,"Record function id": 0, "Ev Idx": 14 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193021450.019, "dur": 24.497, | |
"args": { | |
"External id": 528,"Record function id": 0, "Ev Idx": 15 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193021454.196, "dur": 19.779, | |
"args": { | |
"External id": 529,"Record function id": 0, "Ev Idx": 16 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193021485.723, "dur": 42.444, | |
"args": { | |
"External id": 530,"Record function id": 0, "Ev Idx": 17 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "ToCopyBackward0", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193021529.269, "dur": 80010.297, | |
"args": { | |
"External id": 531,"Record function id": 0, "Sequence number": 0, "Fwd thread id": 1, "Ev Idx": 18 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193021533.505, "dur": 80003.157, | |
"args": { | |
"External id": 532,"Record function id": 0, "Ev Idx": 19 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193021536.900, "dur": 79997.639, | |
"args": { | |
"External id": 533,"Record function id": 0, "Ev Idx": 20 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193021544.382, "dur": 25.839, | |
"args": { | |
"External id": 534,"Record function id": 0, "Ev Idx": 21 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193021571.072, "dur": 79951.138, | |
"args": { | |
"External id": 535,"Record function id": 0, "Ev Idx": 22 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193107960.067, "dur": 553.505, | |
"args": { | |
"External id": 536,"Record function id": 0, "Sequence number": 135, "Fwd thread id": 1, "Ev Idx": 23 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193107968.520, "dur": 521.386, | |
"args": { | |
"External id": 537,"Record function id": 0, "Sequence number": 135, "Fwd thread id": 1, "Ev Idx": 24 | |
} | |
}, | |
{ | |
"ph": "f", "id": 2, "pid": 2337800, "tid": 2340515, "ts": 6071193107968.520, | |
"cat": "fwdbwd", "name": "fwdbwd", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_0", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193108318.467, "dur": 71.628, | |
"args": { | |
"External id": 538,"Record function id": 0, "Ev Idx": 25 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_1", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193108404.808, "dur": 14.241, | |
"args": { | |
"External id": 539,"Record function id": 0, "Ev Idx": 26 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_mul_2", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193108446.530, "dur": 11.288, | |
"args": { | |
"External id": 540,"Record function id": 0, "Ev Idx": 27 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193108532.600, "dur": 25.449, | |
"args": { | |
"External id": 541,"Record function id": 0, "Ev Idx": 28 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193108535.495, "dur": 20.801, | |
"args": { | |
"External id": 542,"Record function id": 0, "Ev Idx": 29 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193108545.570, "dur": 9.995, | |
"args": { | |
"External id": 543,"Record function id": 0, "Ev Idx": 30 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "detach", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193108547.823, "dur": 7.441, | |
"args": { | |
"External id": 544,"Record function id": 0, "Ev Idx": 31 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193108562.826, "dur": 4.256, | |
"args": { | |
"External id": 545,"Record function id": 0, "Ev Idx": 32 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193108564.218, "dur": 2.123, | |
"args": { | |
"External id": 546,"Record function id": 0, "Ev Idx": 33 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193108564.969, "dur": 1.052, | |
"args": { | |
"External id": 547,"Record function id": 0, "Ev Idx": 34 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "detach", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193108565.300, "dur": 0.600, | |
"args": { | |
"External id": 548,"Record function id": 0, "Ev Idx": 35 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ToCopyBackward0", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193108571.389, "dur": 82018.265, | |
"args": { | |
"External id": 549,"Record function id": 0, "Ev Idx": 36 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193108575.665, "dur": 71.889, | |
"args": { | |
"External id": 550,"Record function id": 0, "Ev Idx": 37 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193108582.616, "dur": 22.163, | |
"args": { | |
"External id": 551,"Record function id": 0, "Ev Idx": 38 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193108586.161, "dur": 18.087, | |
"args": { | |
"External id": 552,"Record function id": 0, "Ev Idx": 39 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193108606.712, "dur": 40.541, | |
"args": { | |
"External id": 553,"Record function id": 0, "Ev Idx": 40 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "ToCopyBackward0", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193108648.575, "dur": 81892.586, | |
"args": { | |
"External id": 554,"Record function id": 0, "Sequence number": 0, "Fwd thread id": 1, "Ev Idx": 41 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193108652.251, "dur": 81885.154, | |
"args": { | |
"External id": 555,"Record function id": 0, "Ev Idx": 42 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193108655.375, "dur": 81879.416, | |
"args": { | |
"External id": 556,"Record function id": 0, "Ev Idx": 43 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193108661.585, "dur": 28.963, | |
"args": { | |
"External id": 557,"Record function id": 0, "Ev Idx": 44 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193108691.500, "dur": 81829.570, | |
"args": { | |
"External id": 558,"Record function id": 0, "Ev Idx": 45 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193196242.517, "dur": 569.128, | |
"args": { | |
"External id": 559,"Record function id": 0, "Sequence number": 136, "Fwd thread id": 1, "Ev Idx": 46 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193196250.239, "dur": 533.464, | |
"args": { | |
"External id": 560,"Record function id": 0, "Sequence number": 136, "Fwd thread id": 1, "Ev Idx": 47 | |
} | |
}, | |
{ | |
"ph": "f", "id": 3, "pid": 2337800, "tid": 2340515, "ts": 6071193196250.239, | |
"cat": "fwdbwd", "name": "fwdbwd", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_0", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193196607.127, "dur": 75.053, | |
"args": { | |
"External id": 561,"Record function id": 0, "Ev Idx": 48 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_1", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193196695.991, "dur": 15.463, | |
"args": { | |
"External id": 562,"Record function id": 0, "Ev Idx": 49 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_mul_2", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193196738.926, "dur": 12.569, | |
"args": { | |
"External id": 563,"Record function id": 0, "Ev Idx": 50 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193196831.535, "dur": 27.962, | |
"args": { | |
"External id": 564,"Record function id": 0, "Ev Idx": 51 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193196835.171, "dur": 22.063, | |
"args": { | |
"External id": 565,"Record function id": 0, "Ev Idx": 52 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193196846.017, "dur": 9.825, | |
"args": { | |
"External id": 566,"Record function id": 0, "Ev Idx": 53 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "detach", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193196848.581, "dur": 7.071, | |
"args": { | |
"External id": 567,"Record function id": 0, "Ev Idx": 54 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193196865.577, "dur": 5.658, | |
"args": { | |
"External id": 568,"Record function id": 0, "Ev Idx": 55 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193196867.419, "dur": 2.774, | |
"args": { | |
"External id": 569,"Record function id": 0, "Ev Idx": 56 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193196868.571, "dur": 1.172, | |
"args": { | |
"External id": 570,"Record function id": 0, "Ev Idx": 57 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "detach", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193196868.982, "dur": 0.611, | |
"args": { | |
"External id": 571,"Record function id": 0, "Ev Idx": 58 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ToCopyBackward0", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193196877.104, "dur": 81367.083, | |
"args": { | |
"External id": 572,"Record function id": 0, "Ev Idx": 59 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193196882.682, "dur": 82.334, | |
"args": { | |
"External id": 573,"Record function id": 0, "Ev Idx": 60 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193196890.714, "dur": 26.240, | |
"args": { | |
"External id": 574,"Record function id": 0, "Ev Idx": 61 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193196895.231, "dur": 21.042, | |
"args": { | |
"External id": 575,"Record function id": 0, "Ev Idx": 62 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193196919.918, "dur": 44.738, | |
"args": { | |
"External id": 576,"Record function id": 0, "Ev Idx": 63 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "ToCopyBackward0", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193196966.108, "dur": 81234.904, | |
"args": { | |
"External id": 577,"Record function id": 0, "Sequence number": 0, "Fwd thread id": 1, "Ev Idx": 64 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193196970.244, "dur": 81228.715, | |
"args": { | |
"External id": 578,"Record function id": 0, "Ev Idx": 65 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193196973.880, "dur": 81223.527, | |
"args": { | |
"External id": 579,"Record function id": 0, "Ev Idx": 66 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193196980.971, "dur": 36.935, | |
"args": { | |
"External id": 580,"Record function id": 0, "Ev Idx": 67 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193197018.888, "dur": 81169.024, | |
"args": { | |
"External id": 581,"Record function id": 0, "Ev Idx": 68 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193283965.243, "dur": 573.966, | |
"args": { | |
"External id": 582,"Record function id": 0, "Sequence number": 137, "Fwd thread id": 1, "Ev Idx": 69 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193283974.738, "dur": 537.941, | |
"args": { | |
"External id": 583,"Record function id": 0, "Sequence number": 137, "Fwd thread id": 1, "Ev Idx": 70 | |
} | |
}, | |
{ | |
"ph": "f", "id": 4, "pid": 2337800, "tid": 2340515, "ts": 6071193283974.738, | |
"cat": "fwdbwd", "name": "fwdbwd", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_0", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193284333.498, "dur": 75.374, | |
"args": { | |
"External id": 584,"Record function id": 0, "Ev Idx": 71 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_1", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193284423.855, "dur": 14.452, | |
"args": { | |
"External id": 585,"Record function id": 0, "Ev Idx": 72 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_mul_2", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193284465.898, "dur": 14.342, | |
"args": { | |
"External id": 586,"Record function id": 0, "Ev Idx": 73 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193284559.229, "dur": 29.905, | |
"args": { | |
"External id": 587,"Record function id": 0, "Ev Idx": 74 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193284562.854, "dur": 23.946, | |
"args": { | |
"External id": 588,"Record function id": 0, "Ev Idx": 75 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193284573.861, "dur": 11.657, | |
"args": { | |
"External id": 589,"Record function id": 0, "Ev Idx": 76 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "detach", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193284576.455, "dur": 8.883, | |
"args": { | |
"External id": 590,"Record function id": 0, "Ev Idx": 77 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193284594.913, "dur": 5.388, | |
"args": { | |
"External id": 591,"Record function id": 0, "Ev Idx": 78 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193284596.685, "dur": 2.674, | |
"args": { | |
"External id": 592,"Record function id": 0, "Ev Idx": 79 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193284597.657, "dur": 1.252, | |
"args": { | |
"External id": 593,"Record function id": 0, "Ev Idx": 80 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "detach", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193284597.957, "dur": 0.791, | |
"args": { | |
"External id": 594,"Record function id": 0, "Ev Idx": 81 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ToCopyBackward0", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193284606.260, "dur": 79427.378, | |
"args": { | |
"External id": 595,"Record function id": 0, "Ev Idx": 82 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193284612.259, "dur": 86.019, | |
"args": { | |
"External id": 596,"Record function id": 0, "Ev Idx": 83 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193284619.910, "dur": 28.884, | |
"args": { | |
"External id": 597,"Record function id": 0, "Ev Idx": 84 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193284624.467, "dur": 23.776, | |
"args": { | |
"External id": 598,"Record function id": 0, "Ev Idx": 85 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193284651.738, "dur": 46.070, | |
"args": { | |
"External id": 599,"Record function id": 0, "Ev Idx": 86 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "ToCopyBackward0", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193284699.450, "dur": 79288.820, | |
"args": { | |
"External id": 600,"Record function id": 0, "Sequence number": 0, "Fwd thread id": 1, "Ev Idx": 87 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193284703.086, "dur": 79282.720, | |
"args": { | |
"External id": 601,"Record function id": 0, "Ev Idx": 88 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193284706.851, "dur": 79276.772, | |
"args": { | |
"External id": 602,"Record function id": 0, "Ev Idx": 89 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193284713.822, "dur": 36.485, | |
"args": { | |
"External id": 603,"Record function id": 0, "Ev Idx": 90 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193284751.819, "dur": 79220.237, | |
"args": { | |
"External id": 604,"Record function id": 0, "Ev Idx": 91 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193370507.930, "dur": 524.110, | |
"args": { | |
"External id": 605,"Record function id": 0, "Sequence number": 138, "Fwd thread id": 1, "Ev Idx": 92 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193370517.485, "dur": 489.838, | |
"args": { | |
"External id": 606,"Record function id": 0, "Sequence number": 138, "Fwd thread id": 1, "Ev Idx": 93 | |
} | |
}, | |
{ | |
"ph": "f", "id": 5, "pid": 2337800, "tid": 2340515, "ts": 6071193370517.485, | |
"cat": "fwdbwd", "name": "fwdbwd", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_0", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193370845.729, "dur": 65.529, | |
"args": { | |
"External id": 607,"Record function id": 0, "Ev Idx": 94 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_1", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193370925.510, "dur": 13.921, | |
"args": { | |
"External id": 608,"Record function id": 0, "Ev Idx": 95 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_mul_2", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193370966.001, "dur": 12.038, | |
"args": { | |
"External id": 609,"Record function id": 0, "Ev Idx": 96 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193371050.979, "dur": 24.627, | |
"args": { | |
"External id": 610,"Record function id": 0, "Ev Idx": 97 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193371054.304, "dur": 19.600, | |
"args": { | |
"External id": 611,"Record function id": 0, "Ev Idx": 98 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193371063.057, "dur": 9.755, | |
"args": { | |
"External id": 612,"Record function id": 0, "Ev Idx": 99 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "detach", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193371065.281, "dur": 7.260, | |
"args": { | |
"External id": 613,"Record function id": 0, "Ev Idx": 100 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193371079.923, "dur": 4.356, | |
"args": { | |
"External id": 614,"Record function id": 0, "Ev Idx": 101 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193371081.435, "dur": 2.093, | |
"args": { | |
"External id": 615,"Record function id": 0, "Ev Idx": 102 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193371082.126, "dur": 1.072, | |
"args": { | |
"External id": 616,"Record function id": 0, "Ev Idx": 103 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "detach", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193371082.466, "dur": 0.621, | |
"args": { | |
"External id": 617,"Record function id": 0, "Ev Idx": 104 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ToCopyBackward0", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193371088.656, "dur": 81125.689, | |
"args": { | |
"External id": 618,"Record function id": 0, "Ev Idx": 105 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193371093.363, "dur": 70.456, | |
"args": { | |
"External id": 619,"Record function id": 0, "Ev Idx": 106 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193371100.604, "dur": 22.854, | |
"args": { | |
"External id": 620,"Record function id": 0, "Ev Idx": 107 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193371104.570, "dur": 18.337, | |
"args": { | |
"External id": 621,"Record function id": 0, "Ev Idx": 108 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193371125.461, "dur": 38.008, | |
"args": { | |
"External id": 622,"Record function id": 0, "Ev Idx": 109 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "ToCopyBackward0", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193371164.580, "dur": 81005.348, | |
"args": { | |
"External id": 623,"Record function id": 0, "Sequence number": 0, "Fwd thread id": 1, "Ev Idx": 110 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193371167.895, "dur": 80999.349, | |
"args": { | |
"External id": 624,"Record function id": 0, "Ev Idx": 111 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193371171.090, "dur": 80994.251, | |
"args": { | |
"External id": 625,"Record function id": 0, "Ev Idx": 112 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193371177.279, "dur": 34.182, | |
"args": { | |
"External id": 626,"Record function id": 0, "Ev Idx": 113 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193371212.412, "dur": 80942.073, | |
"args": { | |
"External id": 627,"Record function id": 0, "Ev Idx": 114 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193017607.826, "dur": 83.917, | |
"args": { | |
"External id": 1,"Record function id": 0, "Ev Idx": 115 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 0/0", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193017695.438, "dur": 88723.065, | |
"args": { | |
"External id": 2,"Record function id": 0, "Ev Idx": 116 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "Pregraph bytecode", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193017718.353, "dur": 6.940, | |
"args": { | |
"External id": 3,"Record function id": 0, "Ev Idx": 117 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193017821.979, "dur": 2651.638, | |
"args": { | |
"External id": 4,"Record function id": 0, "Sequence number": 134, "Fwd thread id": 0, "Ev Idx": 118 | |
} | |
}, | |
{ | |
"ph": "s", "id": 1, "pid": 2337800, "tid": 2337800, "ts": 6071193017821.979, | |
"cat": "fwdbwd", "name": "fwdbwd" | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused_nll_loss_backward_nll_loss_forward_0", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193019656.365, "dur": 66.140, | |
"args": { | |
"External id": 5,"Record function id": 0, "Ev Idx": 119 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_per_fused_nll_loss_forward_1", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193019740.922, "dur": 14.382, | |
"args": { | |
"External id": 6,"Record function id": 0, "Ev Idx": 120 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_mul_2", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193019775.545, "dur": 15.273, | |
"args": { | |
"External id": 7,"Record function id": 0, "Ev Idx": 121 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193019827.603, "dur": 56.625, | |
"args": { | |
"External id": 8,"Record function id": 0, "Ev Idx": 122 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193019840.763, "dur": 38.909, | |
"args": { | |
"External id": 9,"Record function id": 0, "Ev Idx": 123 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193019851.068, "dur": 22.675, | |
"args": { | |
"External id": 10,"Record function id": 0, "Ev Idx": 124 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193019862.145, "dur": 11.157, | |
"args": { | |
"External id": 11,"Record function id": 0, "Ev Idx": 125 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193019866.231, "dur": 4.617, | |
"args": { | |
"External id": 12,"Record function id": 0, "Ev Idx": 126 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193019897.228, "dur": 3.415, | |
"args": { | |
"External id": 13,"Record function id": 0, "Ev Idx": 127 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193019897.619, "dur": 2.724, | |
"args": { | |
"External id": 14,"Record function id": 0, "Ev Idx": 128 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193019898.149, "dur": 1.513, | |
"args": { | |
"External id": 15,"Record function id": 0, "Ev Idx": 129 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193019898.530, "dur": 0.961, | |
"args": { | |
"External id": 16,"Record function id": 0, "Ev Idx": 130 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193019898.911, "dur": 0.360, | |
"args": { | |
"External id": 17,"Record function id": 0, "Ev Idx": 131 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193019904.259, "dur": 2.724, | |
"args": { | |
"External id": 18,"Record function id": 0, "Ev Idx": 132 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193019904.569, "dur": 2.193, | |
"args": { | |
"External id": 19,"Record function id": 0, "Ev Idx": 133 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193019904.960, "dur": 1.232, | |
"args": { | |
"External id": 20,"Record function id": 0, "Ev Idx": 134 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193019905.330, "dur": 0.731, | |
"args": { | |
"External id": 21,"Record function id": 0, "Ev Idx": 135 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193019905.641, "dur": 0.250, | |
"args": { | |
"External id": 22,"Record function id": 0, "Ev Idx": 136 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193019910.178, "dur": 2.533, | |
"args": { | |
"External id": 23,"Record function id": 0, "Ev Idx": 137 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193019910.468, "dur": 1.993, | |
"args": { | |
"External id": 24,"Record function id": 0, "Ev Idx": 138 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193019910.829, "dur": 1.101, | |
"args": { | |
"External id": 25,"Record function id": 0, "Ev Idx": 139 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193019911.149, "dur": 0.661, | |
"args": { | |
"External id": 26,"Record function id": 0, "Ev Idx": 140 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193019911.419, "dur": 0.221, | |
"args": { | |
"External id": 27,"Record function id": 0, "Ev Idx": 141 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_3", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193019928.565, "dur": 14.602, | |
"args": { | |
"External id": 28,"Record function id": 0, "Ev Idx": 142 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_4", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193019955.856, "dur": 11.418, | |
"args": { | |
"External id": 29,"Record function id": 0, "Ev Idx": 143 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_5", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193019982.527, "dur": 10.976, | |
"args": { | |
"External id": 30,"Record function id": 0, "Ev Idx": 144 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193020026.663, "dur": 91.368, | |
"args": { | |
"External id": 31,"Record function id": 0, "Ev Idx": 145 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused__log_softmax__log_softmax_backward_data_addmm_nll_loss_backward_nll_loss_forward_6", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193020139.233, "dur": 22.063, | |
"args": { | |
"External id": 32,"Record function id": 0, "Ev Idx": 146 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193020169.759, "dur": 32.349, | |
"args": { | |
"External id": 33,"Record function id": 0, "Ev Idx": 147 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused_nll_loss_forward_sum_7", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193020216.610, "dur": 14.722, | |
"args": { | |
"External id": 34,"Record function id": 0, "Ev Idx": 148 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::addmm", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193020243.059, "dur": 86.631, | |
"args": { | |
"External id": 35,"Record function id": 0, "Ev Idx": 149 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused_nll_loss_forward_8", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193020356.551, "dur": 14.391, | |
"args": { | |
"External id": 36,"Record function id": 0, "Ev Idx": 150 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_per_fused_nll_loss_forward_9", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193020383.722, "dur": 13.440, | |
"args": { | |
"External id": 37,"Record function id": 0, "Ev Idx": 151 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::ones_like", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193020559.426, "dur": 46.070, | |
"args": { | |
"External id": 38,"Record function id": 0, "Ev Idx": 152 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193020562.822, "dur": 16.885, | |
"args": { | |
"External id": 39,"Record function id": 0, "Ev Idx": 153 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193020569.862, "dur": 9.024, | |
"args": { | |
"External id": 40,"Record function id": 0, "Ev Idx": 154 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193020584.935, "dur": 20.311, | |
"args": { | |
"External id": 41,"Record function id": 0, "Ev Idx": 155 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193101675.772, "dur": 4674.498, | |
"args": { | |
"External id": 42,"Record function id": 0, "Ev Idx": 156 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193101688.501, "dur": 4658.734, | |
"args": { | |
"External id": 43,"Record function id": 0, "Ev Idx": 157 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193101699.047, "dur": 1083.914, | |
"args": { | |
"External id": 44,"Record function id": 0, "Ev Idx": 158 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193106523.631, "dur": 87.332, | |
"args": { | |
"External id": 45,"Record function id": 0, "Ev Idx": 159 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 0/0", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193106612.255, "dur": 88007.936, | |
"args": { | |
"External id": 46,"Record function id": 0, "Ev Idx": 160 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "Pregraph bytecode", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193106634.038, "dur": 7.010, | |
"args": { | |
"External id": 47,"Record function id": 0, "Ev Idx": 161 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193106736.943, "dur": 994.800, | |
"args": { | |
"External id": 48,"Record function id": 0, "Sequence number": 135, "Fwd thread id": 0, "Ev Idx": 162 | |
} | |
}, | |
{ | |
"ph": "s", "id": 2, "pid": 2337800, "tid": 2337800, "ts": 6071193106736.943, | |
"cat": "fwdbwd", "name": "fwdbwd" | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused_nll_loss_backward_nll_loss_forward_0", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193106896.774, "dur": 86.300, | |
"args": { | |
"External id": 49,"Record function id": 0, "Ev Idx": 163 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_per_fused_nll_loss_forward_1", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193107000.660, "dur": 16.385, | |
"args": { | |
"External id": 50,"Record function id": 0, "Ev Idx": 164 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_mul_2", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193107033.630, "dur": 17.196, | |
"args": { | |
"External id": 51,"Record function id": 0, "Ev Idx": 165 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193107083.335, "dur": 39.510, | |
"args": { | |
"External id": 52,"Record function id": 0, "Ev Idx": 166 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193107090.746, "dur": 30.616, | |
"args": { | |
"External id": 53,"Record function id": 0, "Ev Idx": 167 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193107097.567, "dur": 19.469, | |
"args": { | |
"External id": 54,"Record function id": 0, "Ev Idx": 168 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193107105.478, "dur": 11.087, | |
"args": { | |
"External id": 55,"Record function id": 0, "Ev Idx": 169 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193107110.095, "dur": 4.728, | |
"args": { | |
"External id": 56,"Record function id": 0, "Ev Idx": 170 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193107135.013, "dur": 4.347, | |
"args": { | |
"External id": 57,"Record function id": 0, "Ev Idx": 171 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193107135.544, "dur": 3.485, | |
"args": { | |
"External id": 58,"Record function id": 0, "Ev Idx": 172 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193107136.105, "dur": 2.133, | |
"args": { | |
"External id": 59,"Record function id": 0, "Ev Idx": 173 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193107136.605, "dur": 1.413, | |
"args": { | |
"External id": 60,"Record function id": 0, "Ev Idx": 174 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193107137.096, "dur": 0.631, | |
"args": { | |
"External id": 61,"Record function id": 0, "Ev Idx": 175 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193107143.296, "dur": 4.096, | |
"args": { | |
"External id": 62,"Record function id": 0, "Ev Idx": 176 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193107143.776, "dur": 3.315, | |
"args": { | |
"External id": 63,"Record function id": 0, "Ev Idx": 177 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193107144.267, "dur": 1.923, | |
"args": { | |
"External id": 64,"Record function id": 0, "Ev Idx": 178 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193107144.758, "dur": 1.242, | |
"args": { | |
"External id": 65,"Record function id": 0, "Ev Idx": 179 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193107145.248, "dur": 0.511, | |
"args": { | |
"External id": 66,"Record function id": 0, "Ev Idx": 180 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193107151.207, "dur": 3.996, | |
"args": { | |
"External id": 67,"Record function id": 0, "Ev Idx": 181 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193107151.718, "dur": 3.195, | |
"args": { | |
"External id": 68,"Record function id": 0, "Ev Idx": 182 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193107152.199, "dur": 1.753, | |
"args": { | |
"External id": 69,"Record function id": 0, "Ev Idx": 183 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193107152.680, "dur": 1.081, | |
"args": { | |
"External id": 70,"Record function id": 0, "Ev Idx": 184 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193107153.050, "dur": 0.471, | |
"args": { | |
"External id": 71,"Record function id": 0, "Ev Idx": 185 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_3", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193107171.638, "dur": 14.742, | |
"args": { | |
"External id": 72,"Record function id": 0, "Ev Idx": 186 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_4", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193107200.472, "dur": 12.889, | |
"args": { | |
"External id": 73,"Record function id": 0, "Ev Idx": 187 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_5", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193107224.388, "dur": 13.981, | |
"args": { | |
"External id": 74,"Record function id": 0, "Ev Idx": 188 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193107268.985, "dur": 114.993, | |
"args": { | |
"External id": 75,"Record function id": 0, "Ev Idx": 189 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused__log_softmax__log_softmax_backward_data_addmm_nll_loss_backward_nll_loss_forward_6", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193107408.085, "dur": 28.923, | |
"args": { | |
"External id": 76,"Record function id": 0, "Ev Idx": 190 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193107445.591, "dur": 23.846, | |
"args": { | |
"External id": 77,"Record function id": 0, "Ev Idx": 191 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused_nll_loss_forward_sum_7", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193107484.810, "dur": 17.086, | |
"args": { | |
"External id": 78,"Record function id": 0, "Ev Idx": 192 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::addmm", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193107515.096, "dur": 74.292, | |
"args": { | |
"External id": 79,"Record function id": 0, "Ev Idx": 193 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused_nll_loss_forward_8", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193107615.187, "dur": 16.355, | |
"args": { | |
"External id": 80,"Record function id": 0, "Ev Idx": 194 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_per_fused_nll_loss_forward_9", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193107645.212, "dur": 15.103, | |
"args": { | |
"External id": 81,"Record function id": 0, "Ev Idx": 195 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::ones_like", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193107797.772, "dur": 41.903, | |
"args": { | |
"External id": 82,"Record function id": 0, "Ev Idx": 196 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193107800.917, "dur": 13.520, | |
"args": { | |
"External id": 83,"Record function id": 0, "Ev Idx": 197 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193107805.103, "dur": 8.533, | |
"args": { | |
"External id": 84,"Record function id": 0, "Ev Idx": 198 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193107816.611, "dur": 22.794, | |
"args": { | |
"External id": 85,"Record function id": 0, "Ev Idx": 199 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193190696.625, "dur": 3846.109, | |
"args": { | |
"External id": 86,"Record function id": 0, "Ev Idx": 200 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193190711.307, "dur": 3825.438, | |
"args": { | |
"External id": 87,"Record function id": 0, "Ev Idx": 201 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193190722.264, "dur": 998.035, | |
"args": { | |
"External id": 88,"Record function id": 0, "Ev Idx": 202 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193194731.008, "dur": 88.954, | |
"args": { | |
"External id": 89,"Record function id": 0, "Ev Idx": 203 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 0/0", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193194821.134, "dur": 87447.752, | |
"args": { | |
"External id": 90,"Record function id": 0, "Ev Idx": 204 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "Pregraph bytecode", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193194842.516, "dur": 5.579, | |
"args": { | |
"External id": 91,"Record function id": 0, "Ev Idx": 205 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193194945.892, "dur": 1057.334, | |
"args": { | |
"External id": 92,"Record function id": 0, "Sequence number": 136, "Fwd thread id": 0, "Ev Idx": 206 | |
} | |
}, | |
{ | |
"ph": "s", "id": 3, "pid": 2337800, "tid": 2337800, "ts": 6071193194945.892, | |
"cat": "fwdbwd", "name": "fwdbwd" | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused_nll_loss_backward_nll_loss_forward_0", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193195113.134, "dur": 93.872, | |
"args": { | |
"External id": 93,"Record function id": 0, "Ev Idx": 207 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_per_fused_nll_loss_forward_1", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193195226.846, "dur": 16.655, | |
"args": { | |
"External id": 94,"Record function id": 0, "Ev Idx": 208 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_mul_2", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193195262.990, "dur": 17.627, | |
"args": { | |
"External id": 95,"Record function id": 0, "Ev Idx": 209 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193195339.005, "dur": 46.880, | |
"args": { | |
"External id": 96,"Record function id": 0, "Ev Idx": 210 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193195347.387, "dur": 36.185, | |
"args": { | |
"External id": 97,"Record function id": 0, "Ev Idx": 211 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193195356.261, "dur": 22.564, | |
"args": { | |
"External id": 98,"Record function id": 0, "Ev Idx": 212 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193195365.445, "dur": 12.919, | |
"args": { | |
"External id": 99,"Record function id": 0, "Ev Idx": 213 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193195370.562, "dur": 5.188, | |
"args": { | |
"External id": 100,"Record function id": 0, "Ev Idx": 214 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193195398.605, "dur": 5.117, | |
"args": { | |
"External id": 101,"Record function id": 0, "Ev Idx": 215 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193195399.296, "dur": 4.096, | |
"args": { | |
"External id": 102,"Record function id": 0, "Ev Idx": 216 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193195400.097, "dur": 2.384, | |
"args": { | |
"External id": 103,"Record function id": 0, "Ev Idx": 217 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193195400.798, "dur": 1.442, | |
"args": { | |
"External id": 104,"Record function id": 0, "Ev Idx": 218 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193195401.449, "dur": 0.511, | |
"args": { | |
"External id": 105,"Record function id": 0, "Ev Idx": 219 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193195408.179, "dur": 4.287, | |
"args": { | |
"External id": 106,"Record function id": 0, "Ev Idx": 220 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193195408.620, "dur": 3.525, | |
"args": { | |
"External id": 107,"Record function id": 0, "Ev Idx": 221 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193195409.161, "dur": 2.033, | |
"args": { | |
"External id": 108,"Record function id": 0, "Ev Idx": 222 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193195409.761, "dur": 1.232, | |
"args": { | |
"External id": 109,"Record function id": 0, "Ev Idx": 223 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193195410.402, "dur": 0.351, | |
"args": { | |
"External id": 110,"Record function id": 0, "Ev Idx": 224 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193195416.281, "dur": 4.397, | |
"args": { | |
"External id": 111,"Record function id": 0, "Ev Idx": 225 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193195416.692, "dur": 3.696, | |
"args": { | |
"External id": 112,"Record function id": 0, "Ev Idx": 226 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193195417.283, "dur": 2.313, | |
"args": { | |
"External id": 113,"Record function id": 0, "Ev Idx": 227 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193195417.844, "dur": 1.552, | |
"args": { | |
"External id": 114,"Record function id": 0, "Ev Idx": 228 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193195418.284, "dur": 0.872, | |
"args": { | |
"External id": 115,"Record function id": 0, "Ev Idx": 229 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_3", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193195437.303, "dur": 16.134, | |
"args": { | |
"External id": 116,"Record function id": 0, "Ev Idx": 230 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_4", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193195467.849, "dur": 13.130, | |
"args": { | |
"External id": 117,"Record function id": 0, "Ev Idx": 231 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_5", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193195493.978, "dur": 12.169, | |
"args": { | |
"External id": 118,"Record function id": 0, "Ev Idx": 232 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193195540.589, "dur": 102.584, | |
"args": { | |
"External id": 119,"Record function id": 0, "Ev Idx": 233 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused__log_softmax__log_softmax_backward_data_addmm_nll_loss_backward_nll_loss_forward_6", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193195667.590, "dur": 25.889, | |
"args": { | |
"External id": 120,"Record function id": 0, "Ev Idx": 234 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193195703.364, "dur": 24.467, | |
"args": { | |
"External id": 121,"Record function id": 0, "Ev Idx": 235 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused_nll_loss_forward_sum_7", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193195743.795, "dur": 17.096, | |
"args": { | |
"External id": 122,"Record function id": 0, "Ev Idx": 236 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::addmm", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193195773.600, "dur": 82.114, | |
"args": { | |
"External id": 123,"Record function id": 0, "Ev Idx": 237 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused_nll_loss_forward_8", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193195883.726, "dur": 15.744, | |
"args": { | |
"External id": 124,"Record function id": 0, "Ev Idx": 238 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_per_fused_nll_loss_forward_9", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193195913.241, "dur": 13.730, | |
"args": { | |
"External id": 125,"Record function id": 0, "Ev Idx": 239 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::ones_like", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193196076.567, "dur": 44.106, | |
"args": { | |
"External id": 126,"Record function id": 0, "Ev Idx": 240 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193196080.272, "dur": 14.172, | |
"args": { | |
"External id": 127,"Record function id": 0, "Ev Idx": 241 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193196084.899, "dur": 8.894, | |
"args": { | |
"External id": 128,"Record function id": 0, "Ev Idx": 242 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193196097.759, "dur": 22.634, | |
"args": { | |
"External id": 129,"Record function id": 0, "Ev Idx": 243 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193278361.244, "dur": 3831.447, | |
"args": { | |
"External id": 130,"Record function id": 0, "Ev Idx": 244 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193278375.135, "dur": 3814.511, | |
"args": { | |
"External id": 131,"Record function id": 0, "Ev Idx": 245 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193278386.191, "dur": 849.301, | |
"args": { | |
"External id": 132,"Record function id": 0, "Ev Idx": 246 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193282406.223, "dur": 89.324, | |
"args": { | |
"External id": 133,"Record function id": 0, "Ev Idx": 247 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 0/0", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193282496.819, "dur": 86416.978, | |
"args": { | |
"External id": 134,"Record function id": 0, "Ev Idx": 248 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "Pregraph bytecode", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193282519.714, "dur": 6.109, | |
"args": { | |
"External id": 135,"Record function id": 0, "Ev Idx": 249 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193282626.485, "dur": 1073.789, | |
"args": { | |
"External id": 136,"Record function id": 0, "Sequence number": 137, "Fwd thread id": 0, "Ev Idx": 250 | |
} | |
}, | |
{ | |
"ph": "s", "id": 4, "pid": 2337800, "tid": 2337800, "ts": 6071193282626.485, | |
"cat": "fwdbwd", "name": "fwdbwd" | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused_nll_loss_backward_nll_loss_forward_0", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193282794.819, "dur": 92.048, | |
"args": { | |
"External id": 137,"Record function id": 0, "Ev Idx": 251 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_per_fused_nll_loss_forward_1", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193282905.906, "dur": 15.884, | |
"args": { | |
"External id": 138,"Record function id": 0, "Ev Idx": 252 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_mul_2", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193282939.647, "dur": 16.495, | |
"args": { | |
"External id": 139,"Record function id": 0, "Ev Idx": 253 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193282992.006, "dur": 47.051, | |
"args": { | |
"External id": 140,"Record function id": 0, "Ev Idx": 254 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193283000.048, "dur": 36.124, | |
"args": { | |
"External id": 141,"Record function id": 0, "Ev Idx": 255 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193283008.961, "dur": 22.454, | |
"args": { | |
"External id": 142,"Record function id": 0, "Ev Idx": 256 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193283017.975, "dur": 12.949, | |
"args": { | |
"External id": 143,"Record function id": 0, "Ev Idx": 257 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193283022.993, "dur": 5.558, | |
"args": { | |
"External id": 144,"Record function id": 0, "Ev Idx": 258 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193283050.975, "dur": 4.827, | |
"args": { | |
"External id": 145,"Record function id": 0, "Ev Idx": 259 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193283051.536, "dur": 3.936, | |
"args": { | |
"External id": 146,"Record function id": 0, "Ev Idx": 260 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193283052.297, "dur": 2.453, | |
"args": { | |
"External id": 147,"Record function id": 0, "Ev Idx": 261 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193283052.938, "dur": 1.582, | |
"args": { | |
"External id": 148,"Record function id": 0, "Ev Idx": 262 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193283053.448, "dur": 0.782, | |
"args": { | |
"External id": 149,"Record function id": 0, "Ev Idx": 263 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193283059.738, "dur": 4.116, | |
"args": { | |
"External id": 150,"Record function id": 0, "Ev Idx": 264 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193283060.149, "dur": 3.395, | |
"args": { | |
"External id": 151,"Record function id": 0, "Ev Idx": 265 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193283060.659, "dur": 2.104, | |
"args": { | |
"External id": 152,"Record function id": 0, "Ev Idx": 266 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193283061.160, "dur": 1.402, | |
"args": { | |
"External id": 153,"Record function id": 0, "Ev Idx": 267 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193283061.781, "dur": 0.541, | |
"args": { | |
"External id": 154,"Record function id": 0, "Ev Idx": 268 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193283067.550, "dur": 3.815, | |
"args": { | |
"External id": 155,"Record function id": 0, "Ev Idx": 269 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193283067.970, "dur": 3.105, | |
"args": { | |
"External id": 156,"Record function id": 0, "Ev Idx": 270 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193283068.441, "dur": 1.843, | |
"args": { | |
"External id": 157,"Record function id": 0, "Ev Idx": 271 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193283068.932, "dur": 1.162, | |
"args": { | |
"External id": 158,"Record function id": 0, "Ev Idx": 272 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193283069.302, "dur": 0.561, | |
"args": { | |
"External id": 159,"Record function id": 0, "Ev Idx": 273 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_3", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193283088.071, "dur": 16.565, | |
"args": { | |
"External id": 160,"Record function id": 0, "Ev Idx": 274 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_4", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193283119.047, "dur": 12.960, | |
"args": { | |
"External id": 161,"Record function id": 0, "Ev Idx": 275 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_5", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193283143.945, "dur": 14.241, | |
"args": { | |
"External id": 162,"Record function id": 0, "Ev Idx": 276 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193283195.402, "dur": 131.118, | |
"args": { | |
"External id": 163,"Record function id": 0, "Ev Idx": 277 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused__log_softmax__log_softmax_backward_data_addmm_nll_loss_backward_nll_loss_forward_6", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193283353.491, "dur": 28.052, | |
"args": { | |
"External id": 164,"Record function id": 0, "Ev Idx": 278 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193283391.318, "dur": 26.530, | |
"args": { | |
"External id": 165,"Record function id": 0, "Ev Idx": 279 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused_nll_loss_forward_sum_7", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193283434.112, "dur": 18.008, | |
"args": { | |
"External id": 166,"Record function id": 0, "Ev Idx": 280 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::addmm", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193283465.309, "dur": 81.624, | |
"args": { | |
"External id": 167,"Record function id": 0, "Ev Idx": 281 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused_nll_loss_forward_8", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193283574.985, "dur": 16.024, | |
"args": { | |
"External id": 168,"Record function id": 0, "Ev Idx": 282 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_per_fused_nll_loss_forward_9", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193283605.992, "dur": 14.702, | |
"args": { | |
"External id": 169,"Record function id": 0, "Ev Idx": 283 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::ones_like", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193283784.210, "dur": 44.007, | |
"args": { | |
"External id": 170,"Record function id": 0, "Ev Idx": 284 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193283787.545, "dur": 13.841, | |
"args": { | |
"External id": 171,"Record function id": 0, "Ev Idx": 285 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193283791.912, "dur": 8.863, | |
"args": { | |
"External id": 172,"Record function id": 0, "Ev Idx": 286 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193283804.441, "dur": 23.515, | |
"args": { | |
"External id": 173,"Record function id": 0, "Ev Idx": 287 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193364119.438, "dur": 4724.754, | |
"args": { | |
"External id": 174,"Record function id": 0, "Ev Idx": 288 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193364132.668, "dur": 4708.489, | |
"args": { | |
"External id": 175,"Record function id": 0, "Ev Idx": 289 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193364142.402, "dur": 1031.005, | |
"args": { | |
"External id": 176,"Record function id": 0, "Ev Idx": 290 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193369016.151, "dur": 84.287, | |
"args": { | |
"External id": 177,"Record function id": 0, "Ev Idx": 291 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 0/0", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193369101.740, "dur": 87260.930, | |
"args": { | |
"External id": 178,"Record function id": 0, "Ev Idx": 292 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "Pregraph bytecode", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193369122.772, "dur": 5.338, | |
"args": { | |
"External id": 179,"Record function id": 0, "Ev Idx": 293 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193369223.333, "dur": 1025.927, | |
"args": { | |
"External id": 180,"Record function id": 0, "Sequence number": 138, "Fwd thread id": 0, "Ev Idx": 294 | |
} | |
}, | |
{ | |
"ph": "s", "id": 5, "pid": 2337800, "tid": 2337800, "ts": 6071193369223.333, | |
"cat": "fwdbwd", "name": "fwdbwd" | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused_nll_loss_backward_nll_loss_forward_0", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193369406.630, "dur": 87.572, | |
"args": { | |
"External id": 181,"Record function id": 0, "Ev Idx": 295 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_per_fused_nll_loss_forward_1", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193369512.349, "dur": 16.465, | |
"args": { | |
"External id": 182,"Record function id": 0, "Ev Idx": 296 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_mul_2", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193369546.541, "dur": 17.987, | |
"args": { | |
"External id": 183,"Record function id": 0, "Ev Idx": 297 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193369596.626, "dur": 44.848, | |
"args": { | |
"External id": 184,"Record function id": 0, "Ev Idx": 298 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193369604.127, "dur": 35.474, | |
"args": { | |
"External id": 185,"Record function id": 0, "Ev Idx": 299 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193369612.410, "dur": 22.534, | |
"args": { | |
"External id": 186,"Record function id": 0, "Ev Idx": 300 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193369621.023, "dur": 13.450, | |
"args": { | |
"External id": 187,"Record function id": 0, "Ev Idx": 301 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193369626.742, "dur": 5.628, | |
"args": { | |
"External id": 188,"Record function id": 0, "Ev Idx": 302 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193369654.604, "dur": 4.887, | |
"args": { | |
"External id": 189,"Record function id": 0, "Ev Idx": 303 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193369655.104, "dur": 3.646, | |
"args": { | |
"External id": 190,"Record function id": 0, "Ev Idx": 304 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193369655.635, "dur": 2.314, | |
"args": { | |
"External id": 191,"Record function id": 0, "Ev Idx": 305 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193369656.216, "dur": 1.522, | |
"args": { | |
"External id": 192,"Record function id": 0, "Ev Idx": 306 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193369657.017, "dur": 0.431, | |
"args": { | |
"External id": 193,"Record function id": 0, "Ev Idx": 307 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193369663.537, "dur": 4.417, | |
"args": { | |
"External id": 194,"Record function id": 0, "Ev Idx": 308 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193369664.208, "dur": 3.445, | |
"args": { | |
"External id": 195,"Record function id": 0, "Ev Idx": 309 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193369664.719, "dur": 1.983, | |
"args": { | |
"External id": 196,"Record function id": 0, "Ev Idx": 310 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193369665.220, "dur": 1.291, | |
"args": { | |
"External id": 197,"Record function id": 0, "Ev Idx": 311 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193369665.640, "dur": 0.631, | |
"args": { | |
"External id": 198,"Record function id": 0, "Ev Idx": 312 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193369671.669, "dur": 4.096, | |
"args": { | |
"External id": 199,"Record function id": 0, "Ev Idx": 313 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193369672.100, "dur": 3.345, | |
"args": { | |
"External id": 200,"Record function id": 0, "Ev Idx": 314 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193369672.561, "dur": 1.782, | |
"args": { | |
"External id": 201,"Record function id": 0, "Ev Idx": 315 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193369673.051, "dur": 1.122, | |
"args": { | |
"External id": 202,"Record function id": 0, "Ev Idx": 316 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193369673.522, "dur": 0.421, | |
"args": { | |
"External id": 203,"Record function id": 0, "Ev Idx": 317 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_3", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193369693.843, "dur": 16.414, | |
"args": { | |
"External id": 204,"Record function id": 0, "Ev Idx": 318 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_4", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193369725.390, "dur": 12.639, | |
"args": { | |
"External id": 205,"Record function id": 0, "Ev Idx": 319 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_5", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193369751.239, "dur": 13.861, | |
"args": { | |
"External id": 206,"Record function id": 0, "Ev Idx": 320 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193369797.799, "dur": 96.836, | |
"args": { | |
"External id": 207,"Record function id": 0, "Ev Idx": 321 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused__log_softmax__log_softmax_backward_data_addmm_nll_loss_backward_nll_loss_forward_6", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193369917.310, "dur": 26.610, | |
"args": { | |
"External id": 208,"Record function id": 0, "Ev Idx": 322 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193369953.224, "dur": 24.467, | |
"args": { | |
"External id": 209,"Record function id": 0, "Ev Idx": 323 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused_nll_loss_forward_sum_7", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193369994.456, "dur": 18.057, | |
"args": { | |
"External id": 210,"Record function id": 0, "Ev Idx": 324 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::addmm", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193370024.972, "dur": 76.906, | |
"args": { | |
"External id": 211,"Record function id": 0, "Ev Idx": 325 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused_nll_loss_forward_8", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193370129.029, "dur": 16.465, | |
"args": { | |
"External id": 212,"Record function id": 0, "Ev Idx": 326 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_per_fused_nll_loss_forward_9", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193370159.345, "dur": 16.695, | |
"args": { | |
"External id": 213,"Record function id": 0, "Ev Idx": 327 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::ones_like", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193370344.153, "dur": 44.818, | |
"args": { | |
"External id": 214,"Record function id": 0, "Ev Idx": 328 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193370347.528, "dur": 15.173, | |
"args": { | |
"External id": 215,"Record function id": 0, "Ev Idx": 329 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193370351.915, "dur": 9.985, | |
"args": { | |
"External id": 216,"Record function id": 0, "Ev Idx": 330 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193370365.405, "dur": 23.295, | |
"args": { | |
"External id": 217,"Record function id": 0, "Ev Idx": 331 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193452316.139, "dur": 3949.765, | |
"args": { | |
"External id": 218,"Record function id": 0, "Ev Idx": 332 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193452328.046, "dur": 3933.021, | |
"args": { | |
"External id": 219,"Record function id": 0, "Ev Idx": 333 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193452338.472, "dur": 862.471, | |
"args": { | |
"External id": 220,"Record function id": 0, "Ev Idx": 334 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "overhead", "name": "Unrecognized", "pid": -1, "tid": 0, | |
"ts": 6071193017878.845, "dur": 1677.870 | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_red_fused_nll_loss_backward_nll_loss_forward_0", "pid": 0, "tid": 7, | |
"ts": 6071193019717.866, "dur": 7.104, | |
"args": { | |
"External id": 5, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 30, "registers per thread": 32, "shared memory": 16384, "blocks per SM": 0.030303, "warps per SM": 0.484848, "grid": [4, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 1 | |
} | |
}, | |
{ | |
"ph": "f", "id": 30, "pid": 0, "tid": 7, "ts": 6071193019717.866, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193019690.266, "dur": 30.726, | |
"args": { | |
"External id": 5, "cbid": 307, "correlation": 30 | |
} | |
}, | |
{ | |
"ph": "s", "id": 30, "pid": 2337800, "tid": 2337800, "ts": 6071193019690.266, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_per_fused_nll_loss_forward_1", "pid": 0, "tid": 7, | |
"ts": 6071193019755.913, "dur": 1.632, | |
"args": { | |
"External id": 6, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 38, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.015152, "grid": [1, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0 | |
} | |
}, | |
{ | |
"ph": "f", "id": 38, "pid": 0, "tid": 7, "ts": 6071193019755.913, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193019748.263, "dur": 6.290, | |
"args": { | |
"External id": 6, "cbid": 307, "correlation": 38 | |
} | |
}, | |
{ | |
"ph": "s", "id": 38, "pid": 2337800, "tid": 2337800, "ts": 6071193019748.263, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_mul_2", "pid": 0, "tid": 7, | |
"ts": 6071193019791.785, "dur": 43.776, | |
"args": { | |
"External id": 7, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 46, "registers per thread": 16, "shared memory": 0, "blocks per SM": 186.181824, "warps per SM": 744.727295, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 | |
} | |
}, | |
{ | |
"ph": "f", "id": 46, "pid": 0, "tid": 7, "ts": 6071193019791.785, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193019783.827, "dur": 6.390, | |
"args": { | |
"External id": 7, "cbid": 307, "correlation": 46 | |
} | |
}, | |
{ | |
"ph": "s", "id": 46, "pid": 2337800, "tid": 2337800, "ts": 6071193019783.827, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_3", "pid": 0, "tid": 7, | |
"ts": 6071193019943.785, "dur": 1.344, | |
"args": { | |
"External id": 28, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 53, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.007576, "grid": [1, 1, 1], "block": [32, 1, 1], "est. achieved occupancy %": 0 | |
} | |
}, | |
{ | |
"ph": "f", "id": 53, "pid": 0, "tid": 7, "ts": 6071193019943.785, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193019935.255, "dur": 7.251, | |
"args": { | |
"External id": 28, "cbid": 307, "correlation": 53 | |
} | |
}, | |
{ | |
"ph": "s", "id": 53, "pid": 2337800, "tid": 2337800, "ts": 6071193019935.255, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_4", "pid": 0, "tid": 7, | |
"ts": 6071193019968.617, "dur": 32.065, | |
"args": { | |
"External id": 29, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 60, "registers per thread": 16, "shared memory": 0, "blocks per SM": 285.553040, "warps per SM": 1142.212158, "grid": [37693, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 | |
} | |
}, | |
{ | |
"ph": "f", "id": 60, "pid": 0, "tid": 7, "ts": 6071193019968.617, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193019961.074, "dur": 5.679, | |
"args": { | |
"External id": 29, "cbid": 307, "correlation": 60 | |
} | |
}, | |
{ | |
"ph": "s", "id": 60, "pid": 2337800, "tid": 2337800, "ts": 6071193019961.074, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_5", "pid": 0, "tid": 7, | |
"ts": 6071193020001.609, "dur": 1.440, | |
"args": { | |
"External id": 30, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 67, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.750000, "warps per SM": 3.000000, "grid": [99, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 5 | |
} | |
}, | |
{ | |
"ph": "f", "id": 67, "pid": 0, "tid": 7, "ts": 6071193020001.609, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193019987.184, "dur": 5.828, | |
"args": { | |
"External id": 30, "cbid": 307, "correlation": 67 | |
} | |
}, | |
{ | |
"ph": "s", "id": 67, "pid": 2337800, "tid": 2337800, "ts": 6071193019987.184, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193020103.890, "dur": 1.692, | |
"args": { | |
"External id": 31, "cbid": 200, "correlation": 82 | |
} | |
}, | |
{ | |
"ph": "f", "id": 82, "pid": 2337800, "tid": 2337800, "ts": 6071193020103.890, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2<cutlass_75_tensorop_bf16_s1688gemm_bf16_128x128_tn_align1>(cutlass_75_tensorop_bf16_s1688gemm_bf16_128x128_tn_align1::Params)", "pid": 0, "tid": 7, | |
"ts": 6071193020117.641, "dur": 26803.006, | |
"args": { | |
"External id": 31, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 84, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 775.757568, "warps per SM": 3103.030273, "grid": [2048, 50, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 | |
} | |
}, | |
{ | |
"ph": "f", "id": 84, "pid": 0, "tid": 7, "ts": 6071193020117.641, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193020109.328, "dur": 6.420, | |
"args": { | |
"External id": 31, "cbid": 307, "correlation": 84 | |
} | |
}, | |
{ | |
"ph": "s", "id": 84, "pid": 2337800, "tid": 2337800, "ts": 6071193020109.328, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_red_fused__log_softmax__log_softmax_backward_data_addmm_nll_loss_backward_nll_loss_forward_6", "pid": 0, "tid": 7, | |
"ts": 6071193046921.607, "dur": 5544.031, | |
"args": { | |
"External id": 32, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 105, "registers per thread": 48, "shared memory": 32, "blocks per SM": 248.242432, "warps per SM": 1985.939453, "grid": [32768, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 63 | |
} | |
}, | |
{ | |
"ph": "f", "id": 105, "pid": 0, "tid": 7, "ts": 6071193046921.607, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193020153.344, "dur": 7.191, | |
"args": { | |
"External id": 32, "cbid": 307, "correlation": 105 | |
} | |
}, | |
{ | |
"ph": "s", "id": 105, "pid": 2337800, "tid": 2337800, "ts": 6071193020153.344, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193020181.937, "dur": 0.361, | |
"args": { | |
"External id": 33, "cbid": 200, "correlation": 118 | |
} | |
}, | |
{ | |
"ph": "f", "id": 118, "pid": 2337800, "tid": 2337800, "ts": 6071193020181.937, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2<cutlass_75_tensorop_bf16_s1688gemm_bf16_256x128_nn_align1>(cutlass_75_tensorop_bf16_s1688gemm_bf16_256x128_nn_align1::Params)", "pid": 0, "tid": 7, | |
"ts": 6071193052466.502, "dur": 19462.558, | |
"args": { | |
"External id": 33, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 121, "registers per thread": 229, "shared memory": 49152, "blocks per SM": 7.757576, "warps per SM": 62.060608, "grid": [1024, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 | |
} | |
}, | |
{ | |
"ph": "f", "id": 121, "pid": 0, "tid": 7, "ts": 6071193052466.502, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193020195.858, "dur": 5.398, | |
"args": { | |
"External id": 33, "cbid": 307, "correlation": 121 | |
} | |
}, | |
{ | |
"ph": "s", "id": 121, "pid": 2337800, "tid": 2337800, "ts": 6071193020195.858, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_red_fused_nll_loss_forward_sum_7", "pid": 0, "tid": 7, | |
"ts": 6071193071930.020, "dur": 1659.200, | |
"args": { | |
"External id": 34, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 132, "registers per thread": 40, "shared memory": 4096, "blocks per SM": 5.954545, "warps per SM": 95.272728, "grid": [786, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 75 | |
} | |
}, | |
{ | |
"ph": "f", "id": 132, "pid": 0, "tid": 7, "ts": 6071193071930.020, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193020224.592, "dur": 6.079, | |
"args": { | |
"External id": 34, "cbid": 307, "correlation": 132 | |
} | |
}, | |
{ | |
"ph": "s", "id": 132, "pid": 2337800, "tid": 2337800, "ts": 6071193020224.592, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, | |
"ts": 6071193073590.116, "dur": 75.072, | |
"args": { | |
"External id": 35, "device": 0, "context": 1, "stream": 7, "correlation": 139, "bytes": 77194752, "memory bandwidth (GB/s)": 1028.2762148337597 | |
} | |
}, | |
{ | |
"ph": "f", "id": 139, "pid": 0, "tid": 7, "ts": 6071193073590.116, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193020270.501, "dur": 21.893, | |
"args": { | |
"External id": 35, "cbid": 41, "correlation": 139 | |
} | |
}, | |
{ | |
"ph": "s", "id": 139, "pid": 2337800, "tid": 2337800, "ts": 6071193020270.501, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193020321.137, "dur": 0.551, | |
"args": { | |
"External id": 35, "cbid": 200, "correlation": 150 | |
} | |
}, | |
{ | |
"ph": "f", "id": 150, "pid": 2337800, "tid": 2337800, "ts": 6071193020321.137, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2<cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_256x128_32x6_nt_align8>(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_256x128_32x6_nt_align8::Params)", "pid": 0, "tid": 7, | |
"ts": 6071193073666.116, "dur": 5819.263, | |
"args": { | |
"External id": 35, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 153, "registers per thread": 216, "shared memory": 147456, "blocks per SM": 11.909091, "warps per SM": 95.272728, "grid": [1572, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 | |
} | |
}, | |
{ | |
"ph": "f", "id": 153, "pid": 0, "tid": 7, "ts": 6071193073666.116, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193020323.641, "dur": 4.707, | |
"args": { | |
"External id": 35, "cbid": 307, "correlation": 153 | |
} | |
}, | |
{ | |
"ph": "s", "id": 153, "pid": 2337800, "tid": 2337800, "ts": 6071193020323.641, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_red_fused_nll_loss_forward_8", "pid": 0, "tid": 7, | |
"ts": 6071193079487.267, "dur": 2.720, | |
"args": { | |
"External id": 36, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 165, "registers per thread": 26, "shared memory": 64, "blocks per SM": 0.030303, "warps per SM": 0.484848, "grid": [4, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 1 | |
} | |
}, | |
{ | |
"ph": "f", "id": 165, "pid": 0, "tid": 7, "ts": 6071193079487.267, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193020364.653, "dur": 5.608, | |
"args": { | |
"External id": 36, "cbid": 307, "correlation": 165 | |
} | |
}, | |
{ | |
"ph": "s", "id": 165, "pid": 2337800, "tid": 2337800, "ts": 6071193020364.653, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_per_fused_nll_loss_forward_9", "pid": 0, "tid": 7, | |
"ts": 6071193079490.915, "dur": 1.728, | |
"args": { | |
"External id": 37, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 170, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.015152, "grid": [1, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0 | |
} | |
}, | |
{ | |
"ph": "f", "id": 170, "pid": 0, "tid": 7, "ts": 6071193079490.915, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193020391.443, "dur": 5.178, | |
"args": { | |
"External id": 37, "cbid": 307, "correlation": 170 | |
} | |
}, | |
{ | |
"ph": "s", "id": 170, "pid": 2337800, "tid": 2337800, "ts": 6071193020391.443, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<8, at::native::FillFunctor<c10::BFloat16>, std::array<char*, 1ul> >(int, at::native::FillFunctor<c10::BFloat16>, std::array<char*, 1ul>)", "pid": 0, "tid": 7, | |
"ts": 6071193079493.507, "dur": 1.344, | |
"args": { | |
"External id": 41, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 181, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.030303, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 | |
} | |
}, | |
{ | |
"ph": "f", "id": 181, "pid": 0, "tid": 7, "ts": 6071193079493.507, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193020595.912, "dur": 8.402, | |
"args": { | |
"External id": 41, "cbid": 211, "correlation": 181 | |
} | |
}, | |
{ | |
"ph": "s", "id": 181, "pid": 2337800, "tid": 2337800, "ts": 6071193020595.912, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193020652.367, "dur": 2.083, | |
"args": { | |
"External id": 2, "cbid": 135, "correlation": 189 | |
} | |
}, | |
{ | |
"ph": "f", "id": 189, "pid": 2337800, "tid": 2337800, "ts": 6071193020652.367, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_0", "pid": 0, "tid": 7, | |
"ts": 6071193079495.811, "dur": 61.088, | |
"args": { | |
"External id": 515, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 198, "registers per thread": 16, "shared memory": 0, "blocks per SM": 285.553040, "warps per SM": 1142.212158, "grid": [37693, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 | |
} | |
}, | |
{ | |
"ph": "f", "id": 198, "pid": 0, "tid": 7, "ts": 6071193079495.811, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193021185.510, "dur": 39.610, | |
"args": { | |
"External id": 515, "cbid": 307, "correlation": 198 | |
} | |
}, | |
{ | |
"ph": "s", "id": 198, "pid": 2337800, "tid": 2340515, "ts": 6071193021185.510, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_1", "pid": 0, "tid": 7, | |
"ts": 6071193079557.763, "dur": 2.336, | |
"args": { | |
"External id": 516, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 202, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.492424, "warps per SM": 5.969697, "grid": [197, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 | |
} | |
}, | |
{ | |
"ph": "f", "id": 202, "pid": 0, "tid": 7, "ts": 6071193079557.763, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193021248.836, "dur": 6.510, | |
"args": { | |
"External id": 516, "cbid": 307, "correlation": 202 | |
} | |
}, | |
{ | |
"ph": "s", "id": 202, "pid": 2337800, "tid": 2340515, "ts": 6071193021248.836, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_mul_2", "pid": 0, "tid": 7, | |
"ts": 6071193079561.091, "dur": 47.488, | |
"args": { | |
"External id": 517, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 206, "registers per thread": 22, "shared memory": 0, "blocks per SM": 186.181824, "warps per SM": 744.727295, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 | |
} | |
}, | |
{ | |
"ph": "f", "id": 206, "pid": 0, "tid": 7, "ts": 6071193079561.091, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193021291.610, "dur": 17.827, | |
"args": { | |
"External id": 517, "cbid": 307, "correlation": 206 | |
} | |
}, | |
{ | |
"ph": "s", "id": 206, "pid": 2337800, "tid": 2340515, "ts": 6071193021291.610, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193021360.124, "dur": 2.644, | |
"args": { | |
"External id": 513, "cbid": 135, "correlation": 211 | |
} | |
}, | |
{ | |
"ph": "f", "id": 211, "pid": 2337800, "tid": 2340515, "ts": 6071193021360.124, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193021367.445, "dur": 0.420, | |
"args": { | |
"External id": 513, "cbid": 135, "correlation": 216 | |
} | |
}, | |
{ | |
"ph": "f", "id": 216, "pid": 2337800, "tid": 2340515, "ts": 6071193021367.445, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193021370.309, "dur": 0.431, | |
"args": { | |
"External id": 513, "cbid": 135, "correlation": 221 | |
} | |
}, | |
{ | |
"ph": "f", "id": 221, "pid": 2337800, "tid": 2340515, "ts": 6071193021370.309, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, | |
"ts": 6071193079609.571, "dur": 55.168, | |
"args": { | |
"External id": 530, "device": 0, "context": 1, "stream": 7, "correlation": 255, "bytes": 50331648, "memory bandwidth (GB/s)": 912.3341067285382 | |
} | |
}, | |
{ | |
"ph": "f", "id": 255, "pid": 0, "tid": 7, "ts": 6071193079609.571, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193021504.792, "dur": 20.120, | |
"args": { | |
"External id": 530, "cbid": 41, "correlation": 255 | |
} | |
}, | |
{ | |
"ph": "s", "id": 255, "pid": 2337800, "tid": 2340515, "ts": 6071193021504.792, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 0, "tid": 7, | |
"ts": 6071193079671.587, "dur": 20918.622, | |
"args": { | |
"External id": 535, "device": 0, "context": 1, "stream": 7, "correlation": 261, "bytes": 50331648, "memory bandwidth (GB/s)": 2.4060690039716763 | |
} | |
}, | |
{ | |
"ph": "f", "id": 261, "pid": 0, "tid": 7, "ts": 6071193079671.587, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193021573.485, "dur": 79912.230, | |
"args": { | |
"External id": 535, "cbid": 41, "correlation": 261 | |
} | |
}, | |
{ | |
"ph": "s", "id": 261, "pid": 2337800, "tid": 2340515, "ts": 6071193021573.485, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193101491.093, "dur": 14.462, | |
"args": { | |
"External id": 535, "cbid": 131, "correlation": 262 | |
} | |
}, | |
{ | |
"ph": "s", "id": 262, "pid": 2337800, "tid": 2340515, "ts": 6071193101491.093, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_red_fused_nll_loss_backward_nll_loss_forward_0", "pid": 0, "tid": 7, | |
"ts": 6071193106980.256, "dur": 7.488, | |
"args": { | |
"External id": 49, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 300, "registers per thread": 32, "shared memory": 16384, "blocks per SM": 0.030303, "warps per SM": 0.484848, "grid": [4, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 1 | |
} | |
}, | |
{ | |
"ph": "f", "id": 300, "pid": 0, "tid": 7, "ts": 6071193106980.256, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193106938.697, "dur": 43.486, | |
"args": { | |
"External id": 49, "cbid": 307, "correlation": 300 | |
} | |
}, | |
{ | |
"ph": "s", "id": 300, "pid": 2337800, "tid": 2337800, "ts": 6071193106938.697, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_per_fused_nll_loss_forward_1", "pid": 0, "tid": 7, | |
"ts": 6071193107016.736, "dur": 1.664, | |
"args": { | |
"External id": 50, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 308, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.015152, "grid": [1, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0 | |
} | |
}, | |
{ | |
"ph": "f", "id": 308, "pid": 0, "tid": 7, "ts": 6071193107016.736, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193107008.322, "dur": 7.912, | |
"args": { | |
"External id": 50, "cbid": 307, "correlation": 308 | |
} | |
}, | |
{ | |
"ph": "s", "id": 308, "pid": 2337800, "tid": 2337800, "ts": 6071193107008.322, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_mul_2", "pid": 0, "tid": 7, | |
"ts": 6071193107051.424, "dur": 44.128, | |
"args": { | |
"External id": 51, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 316, "registers per thread": 16, "shared memory": 0, "blocks per SM": 186.181824, "warps per SM": 744.727295, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 | |
} | |
}, | |
{ | |
"ph": "f", "id": 316, "pid": 0, "tid": 7, "ts": 6071193107051.424, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193107042.694, "dur": 7.441, | |
"args": { | |
"External id": 51, "cbid": 307, "correlation": 316 | |
} | |
}, | |
{ | |
"ph": "s", "id": 316, "pid": 2337800, "tid": 2337800, "ts": 6071193107042.694, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_3", "pid": 0, "tid": 7, | |
"ts": 6071193107187.168, "dur": 1.376, | |
"args": { | |
"External id": 72, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 323, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.007576, "grid": [1, 1, 1], "block": [32, 1, 1], "est. achieved occupancy %": 0 | |
} | |
}, | |
{ | |
"ph": "f", "id": 323, "pid": 0, "tid": 7, "ts": 6071193107187.168, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193107177.878, "dur": 7.912, | |
"args": { | |
"External id": 72, "cbid": 307, "correlation": 323 | |
} | |
}, | |
{ | |
"ph": "s", "id": 323, "pid": 2337800, "tid": 2337800, "ts": 6071193107177.878, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_4", "pid": 0, "tid": 7, | |
"ts": 6071193107214.240, "dur": 32.256, | |
"args": { | |
"External id": 73, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 330, "registers per thread": 16, "shared memory": 0, "blocks per SM": 285.553040, "warps per SM": 1142.212158, "grid": [37693, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 | |
} | |
}, | |
{ | |
"ph": "f", "id": 330, "pid": 0, "tid": 7, "ts": 6071193107214.240, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193107206.431, "dur": 6.319, | |
"args": { | |
"External id": 73, "cbid": 307, "correlation": 330 | |
} | |
}, | |
{ | |
"ph": "s", "id": 330, "pid": 2337800, "tid": 2337800, "ts": 6071193107206.431, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_5", "pid": 0, "tid": 7, | |
"ts": 6071193107247.392, "dur": 1.440, | |
"args": { | |
"External id": 74, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 337, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.750000, "warps per SM": 3.000000, "grid": [99, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 5 | |
} | |
}, | |
{ | |
"ph": "f", "id": 337, "pid": 0, "tid": 7, "ts": 6071193107247.392, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193107230.587, "dur": 7.331, | |
"args": { | |
"External id": 74, "cbid": 307, "correlation": 337 | |
} | |
}, | |
{ | |
"ph": "s", "id": 337, "pid": 2337800, "tid": 2337800, "ts": 6071193107230.587, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193107368.966, "dur": 2.143, | |
"args": { | |
"External id": 75, "cbid": 200, "correlation": 352 | |
} | |
}, | |
{ | |
"ph": "f", "id": 352, "pid": 2337800, "tid": 2337800, "ts": 6071193107368.966, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2<cutlass_75_tensorop_bf16_s1688gemm_bf16_128x128_tn_align1>(cutlass_75_tensorop_bf16_s1688gemm_bf16_128x128_tn_align1::Params)", "pid": 0, "tid": 7, | |
"ts": 6071193107384.320, "dur": 27572.893, | |
"args": { | |
"External id": 75, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 354, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 775.757568, "warps per SM": 3103.030273, "grid": [2048, 50, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 | |
} | |
}, | |
{ | |
"ph": "f", "id": 354, "pid": 0, "tid": 7, "ts": 6071193107384.320, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193107374.654, "dur": 7.331, | |
"args": { | |
"External id": 75, "cbid": 307, "correlation": 354 | |
} | |
}, | |
{ | |
"ph": "s", "id": 354, "pid": 2337800, "tid": 2337800, "ts": 6071193107374.654, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_red_fused__log_softmax__log_softmax_backward_data_addmm_nll_loss_backward_nll_loss_forward_6", "pid": 0, "tid": 7, | |
"ts": 6071193134958.237, "dur": 5611.168, | |
"args": { | |
"External id": 76, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 375, "registers per thread": 48, "shared memory": 32, "blocks per SM": 248.242432, "warps per SM": 1985.939453, "grid": [32768, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 63 | |
} | |
}, | |
{ | |
"ph": "f", "id": 375, "pid": 0, "tid": 7, "ts": 6071193134958.237, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193107427.794, "dur": 8.333, | |
"args": { | |
"External id": 76, "cbid": 307, "correlation": 375 | |
} | |
}, | |
{ | |
"ph": "s", "id": 375, "pid": 2337800, "tid": 2337800, "ts": 6071193107427.794, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193107458.561, "dur": 0.441, | |
"args": { | |
"External id": 77, "cbid": 200, "correlation": 388 | |
} | |
}, | |
{ | |
"ph": "f", "id": 388, "pid": 2337800, "tid": 2337800, "ts": 6071193107458.561, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2<cutlass_75_tensorop_bf16_s1688gemm_bf16_256x128_nn_align1>(cutlass_75_tensorop_bf16_s1688gemm_bf16_256x128_nn_align1::Params)", "pid": 0, "tid": 7, | |
"ts": 6071193140570.332, "dur": 19801.471, | |
"args": { | |
"External id": 77, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 391, "registers per thread": 229, "shared memory": 49152, "blocks per SM": 7.757576, "warps per SM": 62.060608, "grid": [1024, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 | |
} | |
}, | |
{ | |
"ph": "f", "id": 391, "pid": 0, "tid": 7, "ts": 6071193140570.332, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193107462.697, "dur": 5.609, | |
"args": { | |
"External id": 77, "cbid": 307, "correlation": 391 | |
} | |
}, | |
{ | |
"ph": "s", "id": 391, "pid": 2337800, "tid": 2337800, "ts": 6071193107462.697, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_red_fused_nll_loss_forward_sum_7", "pid": 0, "tid": 7, | |
"ts": 6071193160372.666, "dur": 1657.024, | |
"args": { | |
"External id": 78, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 402, "registers per thread": 40, "shared memory": 4096, "blocks per SM": 5.954545, "warps per SM": 95.272728, "grid": [786, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 75 | |
} | |
}, | |
{ | |
"ph": "f", "id": 402, "pid": 0, "tid": 7, "ts": 6071193160372.666, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193107493.844, "dur": 7.391, | |
"args": { | |
"External id": 78, "cbid": 307, "correlation": 402 | |
} | |
}, | |
{ | |
"ph": "s", "id": 402, "pid": 2337800, "tid": 2337800, "ts": 6071193107493.844, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, | |
"ts": 6071193162030.650, "dur": 75.200, | |
"args": { | |
"External id": 79, "device": 0, "context": 1, "stream": 7, "correlation": 409, "bytes": 77194752, "memory bandwidth (GB/s)": 1026.5259574468084 | |
} | |
}, | |
{ | |
"ph": "f", "id": 409, "pid": 0, "tid": 7, "ts": 6071193162030.650, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193107531.160, "dur": 29.655, | |
"args": { | |
"External id": 79, "cbid": 41, "correlation": 409 | |
} | |
}, | |
{ | |
"ph": "s", "id": 409, "pid": 2337800, "tid": 2337800, "ts": 6071193107531.160, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193107580.274, "dur": 0.271, | |
"args": { | |
"External id": 79, "cbid": 200, "correlation": 420 | |
} | |
}, | |
{ | |
"ph": "f", "id": 420, "pid": 2337800, "tid": 2337800, "ts": 6071193107580.274, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2<cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_256x128_32x6_nt_align8>(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_256x128_32x6_nt_align8::Params)", "pid": 0, "tid": 7, | |
"ts": 6071193162106.778, "dur": 5807.648, | |
"args": { | |
"External id": 79, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 423, "registers per thread": 216, "shared memory": 147456, "blocks per SM": 11.909091, "warps per SM": 95.272728, "grid": [1572, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 | |
} | |
}, | |
{ | |
"ph": "f", "id": 423, "pid": 0, "tid": 7, "ts": 6071193162106.778, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193107582.107, "dur": 5.929, | |
"args": { | |
"External id": 79, "cbid": 307, "correlation": 423 | |
} | |
}, | |
{ | |
"ph": "s", "id": 423, "pid": 2337800, "tid": 2337800, "ts": 6071193107582.107, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_red_fused_nll_loss_forward_8", "pid": 0, "tid": 7, | |
"ts": 6071193167916.313, "dur": 2.688, | |
"args": { | |
"External id": 80, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 435, "registers per thread": 26, "shared memory": 64, "blocks per SM": 0.030303, "warps per SM": 0.484848, "grid": [4, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 1 | |
} | |
}, | |
{ | |
"ph": "f", "id": 435, "pid": 0, "tid": 7, "ts": 6071193167916.313, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193107624.000, "dur": 6.861, | |
"args": { | |
"External id": 80, "cbid": 307, "correlation": 435 | |
} | |
}, | |
{ | |
"ph": "s", "id": 435, "pid": 2337800, "tid": 2337800, "ts": 6071193107624.000, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_per_fused_nll_loss_forward_9", "pid": 0, "tid": 7, | |
"ts": 6071193167919.929, "dur": 1.696, | |
"args": { | |
"External id": 81, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 440, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.015152, "grid": [1, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0 | |
} | |
}, | |
{ | |
"ph": "f", "id": 440, "pid": 0, "tid": 7, "ts": 6071193167919.929, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193107653.645, "dur": 6.119, | |
"args": { | |
"External id": 81, "cbid": 307, "correlation": 440 | |
} | |
}, | |
{ | |
"ph": "s", "id": 440, "pid": 2337800, "tid": 2337800, "ts": 6071193107653.645, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<8, at::native::FillFunctor<c10::BFloat16>, std::array<char*, 1ul> >(int, at::native::FillFunctor<c10::BFloat16>, std::array<char*, 1ul>)", "pid": 0, "tid": 7, | |
"ts": 6071193167922.553, "dur": 1.376, | |
"args": { | |
"External id": 85, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 451, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.030303, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 | |
} | |
}, | |
{ | |
"ph": "f", "id": 451, "pid": 0, "tid": 7, "ts": 6071193167922.553, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193107827.848, "dur": 10.375, | |
"args": { | |
"External id": 85, "cbid": 211, "correlation": 451 | |
} | |
}, | |
{ | |
"ph": "s", "id": 451, "pid": 2337800, "tid": 2337800, "ts": 6071193107827.848, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193107884.503, "dur": 2.764, | |
"args": { | |
"External id": 46, "cbid": 135, "correlation": 459 | |
} | |
}, | |
{ | |
"ph": "f", "id": 459, "pid": 2337800, "tid": 2337800, "ts": 6071193107884.503, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_0", "pid": 0, "tid": 7, | |
"ts": 6071193167924.762, "dur": 61.343, | |
"args": { | |
"External id": 538, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 468, "registers per thread": 16, "shared memory": 0, "blocks per SM": 285.553040, "warps per SM": 1142.212158, "grid": [37693, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 | |
} | |
}, | |
{ | |
"ph": "f", "id": 468, "pid": 0, "tid": 7, "ts": 6071193167924.762, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193108355.643, "dur": 32.590, | |
"args": { | |
"External id": 538, "cbid": 307, "correlation": 468 | |
} | |
}, | |
{ | |
"ph": "s", "id": 468, "pid": 2337800, "tid": 2340515, "ts": 6071193108355.643, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_1", "pid": 0, "tid": 7, | |
"ts": 6071193167987.482, "dur": 2.399, | |
"args": { | |
"External id": 539, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 472, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.492424, "warps per SM": 5.969697, "grid": [197, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 | |
} | |
}, | |
{ | |
"ph": "f", "id": 472, "pid": 0, "tid": 7, "ts": 6071193167987.482, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193108412.139, "dur": 6.249, | |
"args": { | |
"External id": 539, "cbid": 307, "correlation": 472 | |
} | |
}, | |
{ | |
"ph": "s", "id": 472, "pid": 2337800, "tid": 2340515, "ts": 6071193108412.139, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_mul_2", "pid": 0, "tid": 7, | |
"ts": 6071193167990.809, "dur": 47.648, | |
"args": { | |
"External id": 540, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 476, "registers per thread": 22, "shared memory": 0, "blocks per SM": 186.181824, "warps per SM": 744.727295, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 | |
} | |
}, | |
{ | |
"ph": "f", "id": 476, "pid": 0, "tid": 7, "ts": 6071193167990.809, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193108452.099, "dur": 5.208, | |
"args": { | |
"External id": 540, "cbid": 307, "correlation": 476 | |
} | |
}, | |
{ | |
"ph": "s", "id": 476, "pid": 2337800, "tid": 2340515, "ts": 6071193108452.099, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193108501.533, "dur": 2.224, | |
"args": { | |
"External id": 536, "cbid": 135, "correlation": 481 | |
} | |
}, | |
{ | |
"ph": "f", "id": 481, "pid": 2337800, "tid": 2340515, "ts": 6071193108501.533, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193108507.282, "dur": 0.411, | |
"args": { | |
"External id": 536, "cbid": 135, "correlation": 486 | |
} | |
}, | |
{ | |
"ph": "f", "id": 486, "pid": 2337800, "tid": 2340515, "ts": 6071193108507.282, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193108510.106, "dur": 0.431, | |
"args": { | |
"External id": 536, "cbid": 135, "correlation": 491 | |
} | |
}, | |
{ | |
"ph": "f", "id": 491, "pid": 2337800, "tid": 2340515, "ts": 6071193108510.106, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, | |
"ts": 6071193168039.321, "dur": 55.232, | |
"args": { | |
"External id": 553, "device": 0, "context": 1, "stream": 7, "correlation": 525, "bytes": 50331648, "memory bandwidth (GB/s)": 911.2769409038239 | |
} | |
}, | |
{ | |
"ph": "f", "id": 525, "pid": 0, "tid": 7, "ts": 6071193168039.321, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193108626.061, "dur": 19.129, | |
"args": { | |
"External id": 553, "cbid": 41, "correlation": 525 | |
} | |
}, | |
{ | |
"ph": "s", "id": 525, "pid": 2337800, "tid": 2340515, "ts": 6071193108626.061, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 0, "tid": 7, | |
"ts": 6071193168104.057, "dur": 21092.190, | |
"args": { | |
"External id": 558, "device": 0, "context": 1, "stream": 7, "correlation": 531, "bytes": 50331648, "memory bandwidth (GB/s)": 2.386269420102891 | |
} | |
}, | |
{ | |
"ph": "f", "id": 531, "pid": 0, "tid": 7, "ts": 6071193168104.057, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193108693.763, "dur": 81783.001, | |
"args": { | |
"External id": 558, "cbid": 41, "correlation": 531 | |
} | |
}, | |
{ | |
"ph": "s", "id": 531, "pid": 2337800, "tid": 2340515, "ts": 6071193108693.763, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193190484.065, "dur": 15.543, | |
"args": { | |
"External id": 558, "cbid": 131, "correlation": 532 | |
} | |
}, | |
{ | |
"ph": "s", "id": 532, "pid": 2337800, "tid": 2340515, "ts": 6071193190484.065, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_red_fused_nll_loss_backward_nll_loss_forward_0", "pid": 0, "tid": 7, | |
"ts": 6071193195203.991, "dur": 7.136, | |
"args": { | |
"External id": 93, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 570, "registers per thread": 32, "shared memory": 16384, "blocks per SM": 0.030303, "warps per SM": 0.484848, "grid": [4, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 1 | |
} | |
}, | |
{ | |
"ph": "f", "id": 570, "pid": 0, "tid": 7, "ts": 6071193195203.991, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193195157.671, "dur": 48.223, | |
"args": { | |
"External id": 93, "cbid": 307, "correlation": 570 | |
} | |
}, | |
{ | |
"ph": "s", "id": 570, "pid": 2337800, "tid": 2337800, "ts": 6071193195157.671, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_per_fused_nll_loss_forward_1", "pid": 0, "tid": 7, | |
"ts": 6071193195243.383, "dur": 1.600, | |
"args": { | |
"External id": 94, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 578, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.015152, "grid": [1, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0 | |
} | |
}, | |
{ | |
"ph": "f", "id": 578, "pid": 0, "tid": 7, "ts": 6071193195243.383, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193195234.337, "dur": 8.353, | |
"args": { | |
"External id": 94, "cbid": 307, "correlation": 578 | |
} | |
}, | |
{ | |
"ph": "s", "id": 578, "pid": 2337800, "tid": 2337800, "ts": 6071193195234.337, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_mul_2", "pid": 0, "tid": 7, | |
"ts": 6071193195280.983, "dur": 43.840, | |
"args": { | |
"External id": 95, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 586, "registers per thread": 16, "shared memory": 0, "blocks per SM": 186.181824, "warps per SM": 744.727295, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 | |
} | |
}, | |
{ | |
"ph": "f", "id": 586, "pid": 0, "tid": 7, "ts": 6071193195280.983, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193195272.645, "dur": 7.321, | |
"args": { | |
"External id": 95, "cbid": 307, "correlation": 586 | |
} | |
}, | |
{ | |
"ph": "s", "id": 586, "pid": 2337800, "tid": 2337800, "ts": 6071193195272.645, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_3", "pid": 0, "tid": 7, | |
"ts": 6071193195454.198, "dur": 1.377, | |
"args": { | |
"External id": 116, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 593, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.007576, "grid": [1, 1, 1], "block": [32, 1, 1], "est. achieved occupancy %": 0 | |
} | |
}, | |
{ | |
"ph": "f", "id": 593, "pid": 0, "tid": 7, "ts": 6071193195454.198, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193195444.274, "dur": 8.432, | |
"args": { | |
"External id": 116, "cbid": 307, "correlation": 593 | |
} | |
}, | |
{ | |
"ph": "s", "id": 593, "pid": 2337800, "tid": 2337800, "ts": 6071193195444.274, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_4", "pid": 0, "tid": 7, | |
"ts": 6071193195481.687, "dur": 31.552, | |
"args": { | |
"External id": 117, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 600, "registers per thread": 16, "shared memory": 0, "blocks per SM": 285.553040, "warps per SM": 1142.212158, "grid": [37693, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 | |
} | |
}, | |
{ | |
"ph": "f", "id": 600, "pid": 0, "tid": 7, "ts": 6071193195481.687, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193195473.848, "dur": 6.530, | |
"args": { | |
"External id": 117, "cbid": 307, "correlation": 600 | |
} | |
}, | |
{ | |
"ph": "s", "id": 600, "pid": 2337800, "tid": 2337800, "ts": 6071193195473.848, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_5", "pid": 0, "tid": 7, | |
"ts": 6071193195514.135, "dur": 1.376, | |
"args": { | |
"External id": 118, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 607, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.750000, "warps per SM": 3.000000, "grid": [99, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 5 | |
} | |
}, | |
{ | |
"ph": "f", "id": 607, "pid": 0, "tid": 7, "ts": 6071193195514.135, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193195499.106, "dur": 6.580, | |
"args": { | |
"External id": 118, "cbid": 307, "correlation": 607 | |
} | |
}, | |
{ | |
"ph": "s", "id": 607, "pid": 2337800, "tid": 2337800, "ts": 6071193195499.106, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193195626.779, "dur": 2.503, | |
"args": { | |
"External id": 119, "cbid": 200, "correlation": 622 | |
} | |
}, | |
{ | |
"ph": "f", "id": 622, "pid": 2337800, "tid": 2337800, "ts": 6071193195626.779, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2<cutlass_75_tensorop_bf16_s1688gemm_bf16_128x128_tn_align1>(cutlass_75_tensorop_bf16_s1688gemm_bf16_128x128_tn_align1::Params)", "pid": 0, "tid": 7, | |
"ts": 6071193195643.095, "dur": 26833.788, | |
"args": { | |
"External id": 119, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 624, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 775.757568, "warps per SM": 3103.030273, "grid": [2048, 50, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 | |
} | |
}, | |
{ | |
"ph": "f", "id": 624, "pid": 0, "tid": 7, "ts": 6071193195643.095, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193195632.868, "dur": 8.302, | |
"args": { | |
"External id": 119, "cbid": 307, "correlation": 624 | |
} | |
}, | |
{ | |
"ph": "s", "id": 624, "pid": 2337800, "tid": 2337800, "ts": 6071193195632.868, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_red_fused__log_softmax__log_softmax_backward_data_addmm_nll_loss_backward_nll_loss_forward_6", "pid": 0, "tid": 7, | |
"ts": 6071193222477.876, "dur": 5565.599, | |
"args": { | |
"External id": 120, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 645, "registers per thread": 48, "shared memory": 32, "blocks per SM": 248.242432, "warps per SM": 1985.939453, "grid": [32768, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 63 | |
} | |
}, | |
{ | |
"ph": "f", "id": 645, "pid": 0, "tid": 7, "ts": 6071193222477.876, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193195684.486, "dur": 8.132, | |
"args": { | |
"External id": 120, "cbid": 307, "correlation": 645 | |
} | |
}, | |
{ | |
"ph": "s", "id": 645, "pid": 2337800, "tid": 2337800, "ts": 6071193195684.486, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193195716.875, "dur": 0.430, | |
"args": { | |
"External id": 121, "cbid": 200, "correlation": 658 | |
} | |
}, | |
{ | |
"ph": "f", "id": 658, "pid": 2337800, "tid": 2337800, "ts": 6071193195716.875, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2<cutlass_75_tensorop_bf16_s1688gemm_bf16_256x128_nn_align1>(cutlass_75_tensorop_bf16_s1688gemm_bf16_256x128_nn_align1::Params)", "pid": 0, "tid": 7, | |
"ts": 6071193228044.435, "dur": 19615.038, | |
"args": { | |
"External id": 121, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 661, "registers per thread": 229, "shared memory": 49152, "blocks per SM": 7.757576, "warps per SM": 62.060608, "grid": [1024, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 | |
} | |
}, | |
{ | |
"ph": "f", "id": 661, "pid": 0, "tid": 7, "ts": 6071193228044.435, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193195721.111, "dur": 5.428, | |
"args": { | |
"External id": 121, "cbid": 307, "correlation": 661 | |
} | |
}, | |
{ | |
"ph": "s", "id": 661, "pid": 2337800, "tid": 2337800, "ts": 6071193195721.111, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_red_fused_nll_loss_forward_sum_7", "pid": 0, "tid": 7, | |
"ts": 6071193247660.369, "dur": 1660.351, | |
"args": { | |
"External id": 122, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 672, "registers per thread": 40, "shared memory": 4096, "blocks per SM": 5.954545, "warps per SM": 95.272728, "grid": [786, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 75 | |
} | |
}, | |
{ | |
"ph": "f", "id": 672, "pid": 0, "tid": 7, "ts": 6071193247660.369, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193195753.209, "dur": 6.720, | |
"args": { | |
"External id": 122, "cbid": 307, "correlation": 672 | |
} | |
}, | |
{ | |
"ph": "s", "id": 672, "pid": 2337800, "tid": 2337800, "ts": 6071193195753.209, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, | |
"ts": 6071193249321.617, "dur": 75.328, | |
"args": { | |
"External id": 123, "device": 0, "context": 1, "stream": 7, "correlation": 679, "bytes": 77194752, "memory bandwidth (GB/s)": 1024.7816482582837 | |
} | |
}, | |
{ | |
"ph": "f", "id": 679, "pid": 0, "tid": 7, "ts": 6071193249321.617, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193195794.952, "dur": 30.246, | |
"args": { | |
"External id": 123, "cbid": 41, "correlation": 679 | |
} | |
}, | |
{ | |
"ph": "s", "id": 679, "pid": 2337800, "tid": 2337800, "ts": 6071193195794.952, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193195846.049, "dur": 0.651, | |
"args": { | |
"External id": 123, "cbid": 200, "correlation": 690 | |
} | |
}, | |
{ | |
"ph": "f", "id": 690, "pid": 2337800, "tid": 2337800, "ts": 6071193195846.049, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2<cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_256x128_32x6_nt_align8>(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_256x128_32x6_nt_align8::Params)", "pid": 0, "tid": 7, | |
"ts": 6071193249397.744, "dur": 5816.192, | |
"args": { | |
"External id": 123, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 693, "registers per thread": 216, "shared memory": 147456, "blocks per SM": 11.909091, "warps per SM": 95.272728, "grid": [1572, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 | |
} | |
}, | |
{ | |
"ph": "f", "id": 693, "pid": 0, "tid": 7, "ts": 6071193249397.744, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193195848.874, "dur": 5.508, | |
"args": { | |
"External id": 123, "cbid": 307, "correlation": 693 | |
} | |
}, | |
{ | |
"ph": "s", "id": 693, "pid": 2337800, "tid": 2337800, "ts": 6071193195848.874, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_red_fused_nll_loss_forward_8", "pid": 0, "tid": 7, | |
"ts": 6071193255215.760, "dur": 2.656, | |
"args": { | |
"External id": 124, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 705, "registers per thread": 26, "shared memory": 64, "blocks per SM": 0.030303, "warps per SM": 0.484848, "grid": [4, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 1 | |
} | |
}, | |
{ | |
"ph": "f", "id": 705, "pid": 0, "tid": 7, "ts": 6071193255215.760, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193195892.219, "dur": 6.600, | |
"args": { | |
"External id": 124, "cbid": 307, "correlation": 705 | |
} | |
}, | |
{ | |
"ph": "s", "id": 705, "pid": 2337800, "tid": 2337800, "ts": 6071193195892.219, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_per_fused_nll_loss_forward_9", "pid": 0, "tid": 7, | |
"ts": 6071193255219.376, "dur": 1.696, | |
"args": { | |
"External id": 125, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 710, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.015152, "grid": [1, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0 | |
} | |
}, | |
{ | |
"ph": "f", "id": 710, "pid": 0, "tid": 7, "ts": 6071193255219.376, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193195920.772, "dur": 5.518, | |
"args": { | |
"External id": 125, "cbid": 307, "correlation": 710 | |
} | |
}, | |
{ | |
"ph": "s", "id": 710, "pid": 2337800, "tid": 2337800, "ts": 6071193195920.772, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<8, at::native::FillFunctor<c10::BFloat16>, std::array<char*, 1ul> >(int, at::native::FillFunctor<c10::BFloat16>, std::array<char*, 1ul>)", "pid": 0, "tid": 7, | |
"ts": 6071193255221.968, "dur": 1.376, | |
"args": { | |
"External id": 129, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 721, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.030303, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 | |
} | |
}, | |
{ | |
"ph": "f", "id": 721, "pid": 0, "tid": 7, "ts": 6071193255221.968, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193196109.667, "dur": 9.504, | |
"args": { | |
"External id": 129, "cbid": 211, "correlation": 721 | |
} | |
}, | |
{ | |
"ph": "s", "id": 721, "pid": 2337800, "tid": 2337800, "ts": 6071193196109.667, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193196161.815, "dur": 3.145, | |
"args": { | |
"External id": 90, "cbid": 135, "correlation": 729 | |
} | |
}, | |
{ | |
"ph": "f", "id": 729, "pid": 2337800, "tid": 2337800, "ts": 6071193196161.815, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_0", "pid": 0, "tid": 7, | |
"ts": 6071193255224.304, "dur": 61.536, | |
"args": { | |
"External id": 561, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 738, "registers per thread": 16, "shared memory": 0, "blocks per SM": 285.553040, "warps per SM": 1142.212158, "grid": [37693, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 | |
} | |
}, | |
{ | |
"ph": "f", "id": 738, "pid": 0, "tid": 7, "ts": 6071193255224.304, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193196642.580, "dur": 38.368, | |
"args": { | |
"External id": 561, "cbid": 307, "correlation": 738 | |
} | |
}, | |
{ | |
"ph": "s", "id": 738, "pid": 2337800, "tid": 2340515, "ts": 6071193196642.580, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_1", "pid": 0, "tid": 7, | |
"ts": 6071193255286.832, "dur": 2.208, | |
"args": { | |
"External id": 562, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 742, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.492424, "warps per SM": 5.969697, "grid": [197, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 | |
} | |
}, | |
{ | |
"ph": "f", "id": 742, "pid": 0, "tid": 7, "ts": 6071193255286.832, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193196703.392, "dur": 7.441, | |
"args": { | |
"External id": 562, "cbid": 307, "correlation": 742 | |
} | |
}, | |
{ | |
"ph": "s", "id": 742, "pid": 2337800, "tid": 2340515, "ts": 6071193196703.392, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_mul_2", "pid": 0, "tid": 7, | |
"ts": 6071193255289.936, "dur": 47.328, | |
"args": { | |
"External id": 563, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 746, "registers per thread": 22, "shared memory": 0, "blocks per SM": 186.181824, "warps per SM": 744.727295, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 | |
} | |
}, | |
{ | |
"ph": "f", "id": 746, "pid": 0, "tid": 7, "ts": 6071193255289.936, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193196744.724, "dur": 6.270, | |
"args": { | |
"External id": 563, "cbid": 307, "correlation": 746 | |
} | |
}, | |
{ | |
"ph": "s", "id": 746, "pid": 2337800, "tid": 2340515, "ts": 6071193196744.724, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193196796.743, "dur": 2.944, | |
"args": { | |
"External id": 559, "cbid": 135, "correlation": 751 | |
} | |
}, | |
{ | |
"ph": "f", "id": 751, "pid": 2337800, "tid": 2340515, "ts": 6071193196796.743, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193196804.314, "dur": 0.691, | |
"args": { | |
"External id": 559, "cbid": 135, "correlation": 756 | |
} | |
}, | |
{ | |
"ph": "f", "id": 756, "pid": 2337800, "tid": 2340515, "ts": 6071193196804.314, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193196808.220, "dur": 0.501, | |
"args": { | |
"External id": 559, "cbid": 135, "correlation": 761 | |
} | |
}, | |
{ | |
"ph": "f", "id": 761, "pid": 2337800, "tid": 2340515, "ts": 6071193196808.220, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, | |
"ts": 6071193255338.224, "dur": 55.488, | |
"args": { | |
"External id": 576, "device": 0, "context": 1, "stream": 7, "correlation": 795, "bytes": 50331648, "memory bandwidth (GB/s)": 907.0726643598616 | |
} | |
}, | |
{ | |
"ph": "f", "id": 795, "pid": 0, "tid": 7, "ts": 6071193255338.224, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193196940.860, "dur": 21.042, | |
"args": { | |
"External id": 576, "cbid": 41, "correlation": 795 | |
} | |
}, | |
{ | |
"ph": "s", "id": 795, "pid": 2337800, "tid": 2340515, "ts": 6071193196940.860, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 0, "tid": 7, | |
"ts": 6071193255397.168, "dur": 21805.245, | |
"args": { | |
"External id": 581, "device": 0, "context": 1, "stream": 7, "correlation": 801, "bytes": 50331648, "memory bandwidth (GB/s)": 2.3082358395881357 | |
} | |
}, | |
{ | |
"ph": "f", "id": 801, "pid": 0, "tid": 7, "ts": 6071193255397.168, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193197021.381, "dur": 81131.458, | |
"args": { | |
"External id": 581, "cbid": 41, "correlation": 801 | |
} | |
}, | |
{ | |
"ph": "s", "id": 801, "pid": 2337800, "tid": 2340515, "ts": 6071193197021.381, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193278157.306, "dur": 14.943, | |
"args": { | |
"External id": 581, "cbid": 131, "correlation": 802 | |
} | |
}, | |
{ | |
"ph": "s", "id": 802, "pid": 2337800, "tid": 2340515, "ts": 6071193278157.306, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_red_fused_nll_loss_backward_nll_loss_forward_0", "pid": 0, "tid": 7, | |
"ts": 6071193282883.853, "dur": 6.976, | |
"args": { | |
"External id": 137, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 840, "registers per thread": 32, "shared memory": 16384, "blocks per SM": 0.030303, "warps per SM": 0.484848, "grid": [4, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 1 | |
} | |
}, | |
{ | |
"ph": "f", "id": 840, "pid": 0, "tid": 7, "ts": 6071193282883.853, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193282835.910, "dur": 50.006, | |
"args": { | |
"External id": 137, "cbid": 307, "correlation": 840 | |
} | |
}, | |
{ | |
"ph": "s", "id": 840, "pid": 2337800, "tid": 2337800, "ts": 6071193282835.910, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_per_fused_nll_loss_forward_1", "pid": 0, "tid": 7, | |
"ts": 6071193282921.677, "dur": 1.632, | |
"args": { | |
"External id": 138, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 848, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.015152, "grid": [1, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0 | |
} | |
}, | |
{ | |
"ph": "f", "id": 848, "pid": 0, "tid": 7, "ts": 6071193282921.677, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193282912.726, "dur": 8.343, | |
"args": { | |
"External id": 138, "cbid": 307, "correlation": 848 | |
} | |
}, | |
{ | |
"ph": "s", "id": 848, "pid": 2337800, "tid": 2337800, "ts": 6071193282912.726, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_mul_2", "pid": 0, "tid": 7, | |
"ts": 6071193282956.461, "dur": 43.712, | |
"args": { | |
"External id": 139, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 856, "registers per thread": 16, "shared memory": 0, "blocks per SM": 186.181824, "warps per SM": 744.727295, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 | |
} | |
}, | |
{ | |
"ph": "f", "id": 856, "pid": 0, "tid": 7, "ts": 6071193282956.461, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193282947.699, "dur": 7.772, | |
"args": { | |
"External id": 139, "cbid": 307, "correlation": 856 | |
} | |
}, | |
{ | |
"ph": "s", "id": 856, "pid": 2337800, "tid": 2337800, "ts": 6071193282947.699, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_3", "pid": 0, "tid": 7, | |
"ts": 6071193283105.069, "dur": 1.344, | |
"args": { | |
"External id": 160, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 863, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.007576, "grid": [1, 1, 1], "block": [32, 1, 1], "est. achieved occupancy %": 0 | |
} | |
}, | |
{ | |
"ph": "f", "id": 863, "pid": 0, "tid": 7, "ts": 6071193283105.069, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193283094.821, "dur": 9.114, | |
"args": { | |
"External id": 160, "cbid": 307, "correlation": 863 | |
} | |
}, | |
{ | |
"ph": "s", "id": 863, "pid": 2337800, "tid": 2337800, "ts": 6071193283094.821, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_4", "pid": 0, "tid": 7, | |
"ts": 6071193283132.941, "dur": 32.000, | |
"args": { | |
"External id": 161, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 870, "registers per thread": 16, "shared memory": 0, "blocks per SM": 285.553040, "warps per SM": 1142.212158, "grid": [37693, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 | |
} | |
}, | |
{ | |
"ph": "f", "id": 870, "pid": 0, "tid": 7, "ts": 6071193283132.941, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193283124.936, "dur": 6.520, | |
"args": { | |
"External id": 161, "cbid": 307, "correlation": 870 | |
} | |
}, | |
{ | |
"ph": "s", "id": 870, "pid": 2337800, "tid": 2337800, "ts": 6071193283124.936, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_5", "pid": 0, "tid": 7, | |
"ts": 6071193283165.965, "dur": 1.408, | |
"args": { | |
"External id": 162, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 877, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.750000, "warps per SM": 3.000000, "grid": [99, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 5 | |
} | |
}, | |
{ | |
"ph": "f", "id": 877, "pid": 0, "tid": 7, "ts": 6071193283165.965, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193283150.575, "dur": 7.061, | |
"args": { | |
"External id": 162, "cbid": 307, "correlation": 877 | |
} | |
}, | |
{ | |
"ph": "s", "id": 877, "pid": 2337800, "tid": 2337800, "ts": 6071193283150.575, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193283284.246, "dur": 2.935, | |
"args": { | |
"External id": 163, "cbid": 200, "correlation": 892 | |
} | |
}, | |
{ | |
"ph": "f", "id": 892, "pid": 2337800, "tid": 2337800, "ts": 6071193283284.246, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2<cutlass_75_tensorop_bf16_s1688gemm_bf16_128x128_tn_align1>(cutlass_75_tensorop_bf16_s1688gemm_bf16_128x128_tn_align1::Params)", "pid": 0, "tid": 7, | |
"ts": 6071193283326.669, "dur": 26839.005, | |
"args": { | |
"External id": 163, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 894, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 775.757568, "warps per SM": 3103.030273, "grid": [2048, 50, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 | |
} | |
}, | |
{ | |
"ph": "f", "id": 894, "pid": 0, "tid": 7, "ts": 6071193283326.669, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193283291.027, "dur": 33.410, | |
"args": { | |
"External id": 163, "cbid": 307, "correlation": 894 | |
} | |
}, | |
{ | |
"ph": "s", "id": 894, "pid": 2337800, "tid": 2337800, "ts": 6071193283291.027, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_red_fused__log_softmax__log_softmax_backward_data_addmm_nll_loss_backward_nll_loss_forward_6", "pid": 0, "tid": 7, | |
"ts": 6071193310166.602, "dur": 5565.183, | |
"args": { | |
"External id": 164, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 915, "registers per thread": 48, "shared memory": 32, "blocks per SM": 248.242432, "warps per SM": 1985.939453, "grid": [32768, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 63 | |
} | |
}, | |
{ | |
"ph": "f", "id": 915, "pid": 0, "tid": 7, "ts": 6071193310166.602, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193283371.638, "dur": 8.934, | |
"args": { | |
"External id": 164, "cbid": 307, "correlation": 915 | |
} | |
}, | |
{ | |
"ph": "s", "id": 915, "pid": 2337800, "tid": 2337800, "ts": 6071193283371.638, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193283405.710, "dur": 0.430, | |
"args": { | |
"External id": 165, "cbid": 200, "correlation": 928 | |
} | |
}, | |
{ | |
"ph": "f", "id": 928, "pid": 2337800, "tid": 2337800, "ts": 6071193283405.710, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2<cutlass_75_tensorop_bf16_s1688gemm_bf16_256x128_nn_align1>(cutlass_75_tensorop_bf16_s1688gemm_bf16_256x128_nn_align1::Params)", "pid": 0, "tid": 7, | |
"ts": 6071193315732.585, "dur": 19614.879, | |
"args": { | |
"External id": 165, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 931, "registers per thread": 229, "shared memory": 49152, "blocks per SM": 7.757576, "warps per SM": 62.060608, "grid": [1024, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 | |
} | |
}, | |
{ | |
"ph": "f", "id": 931, "pid": 0, "tid": 7, "ts": 6071193315732.585, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193283410.577, "dur": 6.109, | |
"args": { | |
"External id": 165, "cbid": 307, "correlation": 931 | |
} | |
}, | |
{ | |
"ph": "s", "id": 931, "pid": 2337800, "tid": 2337800, "ts": 6071193283410.577, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_red_fused_nll_loss_forward_sum_7", "pid": 0, "tid": 7, | |
"ts": 6071193335348.392, "dur": 1660.319, | |
"args": { | |
"External id": 166, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 942, "registers per thread": 40, "shared memory": 4096, "blocks per SM": 5.954545, "warps per SM": 95.272728, "grid": [786, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 75 | |
} | |
}, | |
{ | |
"ph": "f", "id": 942, "pid": 0, "tid": 7, "ts": 6071193335348.392, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193283444.258, "dur": 7.010, | |
"args": { | |
"External id": 166, "cbid": 307, "correlation": 942 | |
} | |
}, | |
{ | |
"ph": "s", "id": 942, "pid": 2337800, "tid": 2337800, "ts": 6071193283444.258, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, | |
"ts": 6071193337009.703, "dur": 75.104, | |
"args": { | |
"External id": 167, "device": 0, "context": 1, "stream": 7, "correlation": 949, "bytes": 77194752, "memory bandwidth (GB/s)": 1027.83809118023 | |
} | |
}, | |
{ | |
"ph": "f", "id": 949, "pid": 0, "tid": 7, "ts": 6071193337009.703, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193283483.206, "dur": 32.389, | |
"args": { | |
"External id": 167, "cbid": 41, "correlation": 949 | |
} | |
}, | |
{ | |
"ph": "s", "id": 949, "pid": 2337800, "tid": 2337800, "ts": 6071193283483.206, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193283536.787, "dur": 0.671, | |
"args": { | |
"External id": 167, "cbid": 200, "correlation": 960 | |
} | |
}, | |
{ | |
"ph": "f", "id": 960, "pid": 2337800, "tid": 2337800, "ts": 6071193283536.787, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2<cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_256x128_32x6_nt_align8>(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_256x128_32x6_nt_align8::Params)", "pid": 0, "tid": 7, | |
"ts": 6071193337085.767, "dur": 5816.352, | |
"args": { | |
"External id": 167, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 963, "registers per thread": 216, "shared memory": 147456, "blocks per SM": 11.909091, "warps per SM": 95.272728, "grid": [1572, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 | |
} | |
}, | |
{ | |
"ph": "f", "id": 963, "pid": 0, "tid": 7, "ts": 6071193337085.767, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193283539.782, "dur": 5.698, | |
"args": { | |
"External id": 167, "cbid": 307, "correlation": 963 | |
} | |
}, | |
{ | |
"ph": "s", "id": 963, "pid": 2337800, "tid": 2337800, "ts": 6071193283539.782, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_red_fused_nll_loss_forward_8", "pid": 0, "tid": 7, | |
"ts": 6071193342904.135, "dur": 2.688, | |
"args": { | |
"External id": 168, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 975, "registers per thread": 26, "shared memory": 64, "blocks per SM": 0.030303, "warps per SM": 0.484848, "grid": [4, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 1 | |
} | |
}, | |
{ | |
"ph": "f", "id": 975, "pid": 0, "tid": 7, "ts": 6071193342904.135, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193283583.748, "dur": 6.540, | |
"args": { | |
"External id": 168, "cbid": 307, "correlation": 975 | |
} | |
}, | |
{ | |
"ph": "s", "id": 975, "pid": 2337800, "tid": 2337800, "ts": 6071193283583.748, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_per_fused_nll_loss_forward_9", "pid": 0, "tid": 7, | |
"ts": 6071193342907.783, "dur": 1.696, | |
"args": { | |
"External id": 169, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 980, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.015152, "grid": [1, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0 | |
} | |
}, | |
{ | |
"ph": "f", "id": 980, "pid": 0, "tid": 7, "ts": 6071193342907.783, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193283614.064, "dur": 6.059, | |
"args": { | |
"External id": 169, "cbid": 307, "correlation": 980 | |
} | |
}, | |
{ | |
"ph": "s", "id": 980, "pid": 2337800, "tid": 2337800, "ts": 6071193283614.064, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<8, at::native::FillFunctor<c10::BFloat16>, std::array<char*, 1ul> >(int, at::native::FillFunctor<c10::BFloat16>, std::array<char*, 1ul>)", "pid": 0, "tid": 7, | |
"ts": 6071193342910.375, "dur": 1.344, | |
"args": { | |
"External id": 173, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 991, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.030303, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 | |
} | |
}, | |
{ | |
"ph": "f", "id": 991, "pid": 0, "tid": 7, "ts": 6071193342910.375, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193283816.599, "dur": 10.216, | |
"args": { | |
"External id": 173, "cbid": 211, "correlation": 991 | |
} | |
}, | |
{ | |
"ph": "s", "id": 991, "pid": 2337800, "tid": 2337800, "ts": 6071193283816.599, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193283878.753, "dur": 2.854, | |
"args": { | |
"External id": 134, "cbid": 135, "correlation": 999 | |
} | |
}, | |
{ | |
"ph": "f", "id": 999, "pid": 2337800, "tid": 2337800, "ts": 6071193283878.753, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_0", "pid": 0, "tid": 7, | |
"ts": 6071193342912.583, "dur": 61.152, | |
"args": { | |
"External id": 584, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1008, "registers per thread": 16, "shared memory": 0, "blocks per SM": 285.553040, "warps per SM": 1142.212158, "grid": [37693, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 | |
} | |
}, | |
{ | |
"ph": "f", "id": 1008, "pid": 0, "tid": 7, "ts": 6071193342912.583, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193284369.032, "dur": 38.278, | |
"args": { | |
"External id": 584, "cbid": 307, "correlation": 1008 | |
} | |
}, | |
{ | |
"ph": "s", "id": 1008, "pid": 2337800, "tid": 2340515, "ts": 6071193284369.032, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_1", "pid": 0, "tid": 7, | |
"ts": 6071193342975.655, "dur": 2.240, | |
"args": { | |
"External id": 585, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1012, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.492424, "warps per SM": 5.969697, "grid": [197, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 | |
} | |
}, | |
{ | |
"ph": "f", "id": 1012, "pid": 0, "tid": 7, "ts": 6071193342975.655, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193284431.006, "dur": 6.690, | |
"args": { | |
"External id": 585, "cbid": 307, "correlation": 1012 | |
} | |
}, | |
{ | |
"ph": "s", "id": 1012, "pid": 2337800, "tid": 2340515, "ts": 6071193284431.006, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_mul_2", "pid": 0, "tid": 7, | |
"ts": 6071193342978.791, "dur": 47.776, | |
"args": { | |
"External id": 586, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1016, "registers per thread": 22, "shared memory": 0, "blocks per SM": 186.181824, "warps per SM": 744.727295, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 | |
} | |
}, | |
{ | |
"ph": "f", "id": 1016, "pid": 0, "tid": 7, "ts": 6071193342978.791, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193284473.079, "dur": 6.610, | |
"args": { | |
"External id": 586, "cbid": 307, "correlation": 1016 | |
} | |
}, | |
{ | |
"ph": "s", "id": 1016, "pid": 2337800, "tid": 2340515, "ts": 6071193284473.079, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193284525.608, "dur": 2.574, | |
"args": { | |
"External id": 582, "cbid": 135, "correlation": 1021 | |
} | |
}, | |
{ | |
"ph": "f", "id": 1021, "pid": 2337800, "tid": 2340515, "ts": 6071193284525.608, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193284532.348, "dur": 0.651, | |
"args": { | |
"External id": 582, "cbid": 135, "correlation": 1026 | |
} | |
}, | |
{ | |
"ph": "f", "id": 1026, "pid": 2337800, "tid": 2340515, "ts": 6071193284532.348, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193284536.034, "dur": 0.501, | |
"args": { | |
"External id": 582, "cbid": 135, "correlation": 1031 | |
} | |
}, | |
{ | |
"ph": "f", "id": 1031, "pid": 2337800, "tid": 2340515, "ts": 6071193284536.034, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, | |
"ts": 6071193343027.431, "dur": 55.616, | |
"args": { | |
"External id": 599, "device": 0, "context": 1, "stream": 7, "correlation": 1065, "bytes": 50331648, "memory bandwidth (GB/s)": 904.9850402761795 | |
} | |
}, | |
{ | |
"ph": "f", "id": 1065, "pid": 0, "tid": 7, "ts": 6071193343027.431, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193284673.812, "dur": 20.631, | |
"args": { | |
"External id": 599, "cbid": 41, "correlation": 1065 | |
} | |
}, | |
{ | |
"ph": "s", "id": 1065, "pid": 2337800, "tid": 2340515, "ts": 6071193284673.812, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 0, "tid": 7, | |
"ts": 6071193343085.287, "dur": 20094.557, | |
"args": { | |
"External id": 604, "device": 0, "context": 1, "stream": 7, "correlation": 1071, "bytes": 50331648, "memory bandwidth (GB/s)": 2.5047403632734975 | |
} | |
}, | |
{ | |
"ph": "f", "id": 1071, "pid": 0, "tid": 7, "ts": 6071193343085.287, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193284754.183, "dur": 79179.284, | |
"args": { | |
"External id": 604, "cbid": 41, "correlation": 1071 | |
} | |
}, | |
{ | |
"ph": "s", "id": 1071, "pid": 2337800, "tid": 2340515, "ts": 6071193284754.183, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193363938.906, "dur": 14.341, | |
"args": { | |
"External id": 604, "cbid": 131, "correlation": 1072 | |
} | |
}, | |
{ | |
"ph": "s", "id": 1072, "pid": 2337800, "tid": 2340515, "ts": 6071193363938.906, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_red_fused_nll_loss_backward_nll_loss_forward_0", "pid": 0, "tid": 7, | |
"ts": 6071193369491.492, "dur": 7.136, | |
"args": { | |
"External id": 181, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1110, "registers per thread": 32, "shared memory": 16384, "blocks per SM": 0.030303, "warps per SM": 0.484848, "grid": [4, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 1 | |
} | |
}, | |
{ | |
"ph": "f", "id": 1110, "pid": 0, "tid": 7, "ts": 6071193369491.492, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193369448.282, "dur": 45.078, | |
"args": { | |
"External id": 181, "cbid": 307, "correlation": 1110 | |
} | |
}, | |
{ | |
"ph": "s", "id": 1110, "pid": 2337800, "tid": 2337800, "ts": 6071193369448.282, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_per_fused_nll_loss_forward_1", "pid": 0, "tid": 7, | |
"ts": 6071193369528.836, "dur": 1.568, | |
"args": { | |
"External id": 182, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1118, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.015152, "grid": [1, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0 | |
} | |
}, | |
{ | |
"ph": "f", "id": 1118, "pid": 0, "tid": 7, "ts": 6071193369528.836, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193369519.770, "dur": 8.163, | |
"args": { | |
"External id": 182, "cbid": 307, "correlation": 1118 | |
} | |
}, | |
{ | |
"ph": "s", "id": 1118, "pid": 2337800, "tid": 2337800, "ts": 6071193369519.770, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_mul_2", "pid": 0, "tid": 7, | |
"ts": 6071193369564.740, "dur": 43.712, | |
"args": { | |
"External id": 183, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1126, "registers per thread": 16, "shared memory": 0, "blocks per SM": 186.181824, "warps per SM": 744.727295, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 | |
} | |
}, | |
{ | |
"ph": "f", "id": 1126, "pid": 0, "tid": 7, "ts": 6071193369564.740, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193369556.275, "dur": 7.532, | |
"args": { | |
"External id": 183, "cbid": 307, "correlation": 1126 | |
} | |
}, | |
{ | |
"ph": "s", "id": 1126, "pid": 2337800, "tid": 2337800, "ts": 6071193369556.275, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_3", "pid": 0, "tid": 7, | |
"ts": 6071193369710.852, "dur": 1.344, | |
"args": { | |
"External id": 204, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1133, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.007576, "grid": [1, 1, 1], "block": [32, 1, 1], "est. achieved occupancy %": 0 | |
} | |
}, | |
{ | |
"ph": "f", "id": 1133, "pid": 0, "tid": 7, "ts": 6071193369710.852, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193369700.753, "dur": 8.763, | |
"args": { | |
"External id": 204, "cbid": 307, "correlation": 1133 | |
} | |
}, | |
{ | |
"ph": "s", "id": 1133, "pid": 2337800, "tid": 2337800, "ts": 6071193369700.753, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_4", "pid": 0, "tid": 7, | |
"ts": 6071193369738.820, "dur": 32.160, | |
"args": { | |
"External id": 205, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1140, "registers per thread": 16, "shared memory": 0, "blocks per SM": 285.553040, "warps per SM": 1142.212158, "grid": [37693, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 | |
} | |
}, | |
{ | |
"ph": "f", "id": 1140, "pid": 0, "tid": 7, "ts": 6071193369738.820, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193369731.249, "dur": 6.159, | |
"args": { | |
"External id": 205, "cbid": 307, "correlation": 1140 | |
} | |
}, | |
{ | |
"ph": "s", "id": 1140, "pid": 2337800, "tid": 2337800, "ts": 6071193369731.249, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_5", "pid": 0, "tid": 7, | |
"ts": 6071193369771.844, "dur": 1.408, | |
"args": { | |
"External id": 206, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1147, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.750000, "warps per SM": 3.000000, "grid": [99, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 5 | |
} | |
}, | |
{ | |
"ph": "f", "id": 1147, "pid": 0, "tid": 7, "ts": 6071193369771.844, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193369757.729, "dur": 6.900, | |
"args": { | |
"External id": 206, "cbid": 307, "correlation": 1147 | |
} | |
}, | |
{ | |
"ph": "s", "id": 1147, "pid": 2337800, "tid": 2337800, "ts": 6071193369757.729, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193369879.172, "dur": 2.454, | |
"args": { | |
"External id": 207, "cbid": 200, "correlation": 1162 | |
} | |
}, | |
{ | |
"ph": "f", "id": 1162, "pid": 2337800, "tid": 2337800, "ts": 6071193369879.172, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2<cutlass_75_tensorop_bf16_s1688gemm_bf16_128x128_tn_align1>(cutlass_75_tensorop_bf16_s1688gemm_bf16_128x128_tn_align1::Params)", "pid": 0, "tid": 7, | |
"ts": 6071193369894.628, "dur": 26815.421, | |
"args": { | |
"External id": 207, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1164, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 775.757568, "warps per SM": 3103.030273, "grid": [2048, 50, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13 | |
} | |
}, | |
{ | |
"ph": "f", "id": 1164, "pid": 0, "tid": 7, "ts": 6071193369894.628, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193369884.981, "dur": 7.681, | |
"args": { | |
"External id": 207, "cbid": 307, "correlation": 1164 | |
} | |
}, | |
{ | |
"ph": "s", "id": 1164, "pid": 2337800, "tid": 2337800, "ts": 6071193369884.981, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_red_fused__log_softmax__log_softmax_backward_data_addmm_nll_loss_backward_nll_loss_forward_6", "pid": 0, "tid": 7, | |
"ts": 6071193396711.009, "dur": 5565.087, | |
"args": { | |
"External id": 208, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1185, "registers per thread": 48, "shared memory": 32, "blocks per SM": 248.242432, "warps per SM": 1985.939453, "grid": [32768, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 63 | |
} | |
}, | |
{ | |
"ph": "f", "id": 1185, "pid": 0, "tid": 7, "ts": 6071193396711.009, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193369934.876, "dur": 8.223, | |
"args": { | |
"External id": 208, "cbid": 307, "correlation": 1185 | |
} | |
}, | |
{ | |
"ph": "s", "id": 1185, "pid": 2337800, "tid": 2337800, "ts": 6071193369934.876, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193369966.644, "dur": 0.421, | |
"args": { | |
"External id": 209, "cbid": 200, "correlation": 1198 | |
} | |
}, | |
{ | |
"ph": "f", "id": 1198, "pid": 2337800, "tid": 2337800, "ts": 6071193369966.644, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2<cutlass_75_tensorop_bf16_s1688gemm_bf16_256x128_nn_align1>(cutlass_75_tensorop_bf16_s1688gemm_bf16_256x128_nn_align1::Params)", "pid": 0, "tid": 7, | |
"ts": 6071193402277.056, "dur": 19591.422, | |
"args": { | |
"External id": 209, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1201, "registers per thread": 229, "shared memory": 49152, "blocks per SM": 7.757576, "warps per SM": 62.060608, "grid": [1024, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13 | |
} | |
}, | |
{ | |
"ph": "f", "id": 1201, "pid": 0, "tid": 7, "ts": 6071193402277.056, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193369971.001, "dur": 5.558, | |
"args": { | |
"External id": 209, "cbid": 307, "correlation": 1201 | |
} | |
}, | |
{ | |
"ph": "s", "id": 1201, "pid": 2337800, "tid": 2337800, "ts": 6071193369971.001, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_red_fused_nll_loss_forward_sum_7", "pid": 0, "tid": 7, | |
"ts": 6071193421869.374, "dur": 1662.208, | |
"args": { | |
"External id": 210, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1212, "registers per thread": 40, "shared memory": 4096, "blocks per SM": 5.954545, "warps per SM": 95.272728, "grid": [786, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 75 | |
} | |
}, | |
{ | |
"ph": "f", "id": 1212, "pid": 0, "tid": 7, "ts": 6071193421869.374, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193370004.912, "dur": 6.970, | |
"args": { | |
"External id": 210, "cbid": 307, "correlation": 1212 | |
} | |
}, | |
{ | |
"ph": "s", "id": 1212, "pid": 2337800, "tid": 2337800, "ts": 6071193370004.912, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, | |
"ts": 6071193423532.445, "dur": 75.137, | |
"args": { | |
"External id": 211, "device": 0, "context": 1, "stream": 7, "correlation": 1219, "bytes": 77194752, "memory bandwidth (GB/s)": 1027.3866670215739 | |
} | |
}, | |
{ | |
"ph": "f", "id": 1219, "pid": 0, "tid": 7, "ts": 6071193423532.445, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193370041.126, "dur": 29.795, | |
"args": { | |
"External id": 211, "cbid": 41, "correlation": 1219 | |
} | |
}, | |
{ | |
"ph": "s", "id": 1219, "pid": 2337800, "tid": 2337800, "ts": 6071193370041.126, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193370091.743, "dur": 0.570, | |
"args": { | |
"External id": 211, "cbid": 200, "correlation": 1230 | |
} | |
}, | |
{ | |
"ph": "f", "id": 1230, "pid": 2337800, "tid": 2337800, "ts": 6071193370091.743, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2<cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_256x128_32x6_nt_align8>(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_256x128_32x6_nt_align8::Params)", "pid": 0, "tid": 7, | |
"ts": 6071193423608.414, "dur": 5815.999, | |
"args": { | |
"External id": 211, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1233, "registers per thread": 216, "shared memory": 147456, "blocks per SM": 11.909091, "warps per SM": 95.272728, "grid": [1572, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 | |
} | |
}, | |
{ | |
"ph": "f", "id": 1233, "pid": 0, "tid": 7, "ts": 6071193423608.414, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193370094.617, "dur": 5.909, | |
"args": { | |
"External id": 211, "cbid": 307, "correlation": 1233 | |
} | |
}, | |
{ | |
"ph": "s", "id": 1233, "pid": 2337800, "tid": 2337800, "ts": 6071193370094.617, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_red_fused_nll_loss_forward_8", "pid": 0, "tid": 7, | |
"ts": 6071193429426.333, "dur": 2.848, | |
"args": { | |
"External id": 212, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1245, "registers per thread": 26, "shared memory": 64, "blocks per SM": 0.030303, "warps per SM": 0.484848, "grid": [4, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 1 | |
} | |
}, | |
{ | |
"ph": "f", "id": 1245, "pid": 0, "tid": 7, "ts": 6071193429426.333, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193370137.542, "dur": 7.130, | |
"args": { | |
"External id": 212, "cbid": 307, "correlation": 1245 | |
} | |
}, | |
{ | |
"ph": "s", "id": 1245, "pid": 2337800, "tid": 2337800, "ts": 6071193370137.542, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_per_fused_nll_loss_forward_9", "pid": 0, "tid": 7, | |
"ts": 6071193429430.173, "dur": 1.696, | |
"args": { | |
"External id": 213, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1250, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.015152, "grid": [1, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0 | |
} | |
}, | |
{ | |
"ph": "f", "id": 1250, "pid": 0, "tid": 7, "ts": 6071193429430.173, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193370168.789, "dur": 6.550, | |
"args": { | |
"External id": 213, "cbid": 307, "correlation": 1250 | |
} | |
}, | |
{ | |
"ph": "s", "id": 1250, "pid": 2337800, "tid": 2337800, "ts": 6071193370168.789, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<8, at::native::FillFunctor<c10::BFloat16>, std::array<char*, 1ul> >(int, at::native::FillFunctor<c10::BFloat16>, std::array<char*, 1ul>)", "pid": 0, "tid": 7, | |
"ts": 6071193429432.765, "dur": 1.344, | |
"args": { | |
"External id": 217, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1261, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.030303, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 | |
} | |
}, | |
{ | |
"ph": "f", "id": 1261, "pid": 0, "tid": 7, "ts": 6071193429432.765, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193370377.203, "dur": 10.005, | |
"args": { | |
"External id": 217, "cbid": 211, "correlation": 1261 | |
} | |
}, | |
{ | |
"ph": "s", "id": 1261, "pid": 2337800, "tid": 2337800, "ts": 6071193370377.203, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193370425.235, "dur": 2.865, | |
"args": { | |
"External id": 178, "cbid": 135, "correlation": 1269 | |
} | |
}, | |
{ | |
"ph": "f", "id": 1269, "pid": 2337800, "tid": 2337800, "ts": 6071193370425.235, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_0", "pid": 0, "tid": 7, | |
"ts": 6071193429435.101, "dur": 61.184, | |
"args": { | |
"External id": 607, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1278, "registers per thread": 16, "shared memory": 0, "blocks per SM": 285.553040, "warps per SM": 1142.212158, "grid": [37693, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 | |
} | |
}, | |
{ | |
"ph": "f", "id": 1278, "pid": 0, "tid": 7, "ts": 6071193429435.101, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193370876.065, "dur": 33.871, | |
"args": { | |
"External id": 607, "cbid": 307, "correlation": 1278 | |
} | |
}, | |
{ | |
"ph": "s", "id": 1278, "pid": 2337800, "tid": 2340515, "ts": 6071193370876.065, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_1", "pid": 0, "tid": 7, | |
"ts": 6071193429497.181, "dur": 2.336, | |
"args": { | |
"External id": 608, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1282, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.492424, "warps per SM": 5.969697, "grid": [197, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9 | |
} | |
}, | |
{ | |
"ph": "f", "id": 1282, "pid": 0, "tid": 7, "ts": 6071193429497.181, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193370932.911, "dur": 5.939, | |
"args": { | |
"External id": 608, "cbid": 307, "correlation": 1282 | |
} | |
}, | |
{ | |
"ph": "s", "id": 1282, "pid": 2337800, "tid": 2340515, "ts": 6071193370932.911, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_mul_2", "pid": 0, "tid": 7, | |
"ts": 6071193429500.477, "dur": 48.352, | |
"args": { | |
"External id": 609, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1286, "registers per thread": 22, "shared memory": 0, "blocks per SM": 186.181824, "warps per SM": 744.727295, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 | |
} | |
}, | |
{ | |
"ph": "f", "id": 1286, "pid": 0, "tid": 7, "ts": 6071193429500.477, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193370972.390, "dur": 5.138, | |
"args": { | |
"External id": 609, "cbid": 307, "correlation": 1286 | |
} | |
}, | |
{ | |
"ph": "s", "id": 1286, "pid": 2337800, "tid": 2340515, "ts": 6071193370972.390, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193371020.253, "dur": 2.604, | |
"args": { | |
"External id": 605, "cbid": 135, "correlation": 1291 | |
} | |
}, | |
{ | |
"ph": "f", "id": 1291, "pid": 2337800, "tid": 2340515, "ts": 6071193371020.253, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193371026.472, "dur": 0.501, | |
"args": { | |
"External id": 605, "cbid": 135, "correlation": 1296 | |
} | |
}, | |
{ | |
"ph": "f", "id": 1296, "pid": 2337800, "tid": 2340515, "ts": 6071193371026.472, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193371029.236, "dur": 0.441, | |
"args": { | |
"External id": 605, "cbid": 135, "correlation": 1301 | |
} | |
}, | |
{ | |
"ph": "f", "id": 1301, "pid": 2337800, "tid": 2340515, "ts": 6071193371029.236, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, | |
"ts": 6071193429549.757, "dur": 55.424, | |
"args": { | |
"External id": 622, "device": 0, "context": 1, "stream": 7, "correlation": 1335, "bytes": 50331648, "memory bandwidth (GB/s)": 908.1200923787529 | |
} | |
}, | |
{ | |
"ph": "f", "id": 1335, "pid": 0, "tid": 7, "ts": 6071193429549.757, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193371144.450, "dur": 16.475, | |
"args": { | |
"External id": 622, "cbid": 41, "correlation": 1335 | |
} | |
}, | |
{ | |
"ph": "s", "id": 1335, "pid": 2337800, "tid": 2340515, "ts": 6071193371144.450, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 0, "tid": 7, | |
"ts": 6071193429607.261, "dur": 21396.606, | |
"args": { | |
"External id": 627, "device": 0, "context": 1, "stream": 7, "correlation": 1341, "bytes": 50331648, "memory bandwidth (GB/s)": 2.352319241659168 | |
} | |
}, | |
{ | |
"ph": "f", "id": 1341, "pid": 0, "tid": 7, "ts": 6071193429607.261, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193371214.405, "dur": 80904.877, | |
"args": { | |
"External id": 627, "cbid": 41, "correlation": 1341 | |
} | |
}, | |
{ | |
"ph": "s", "id": 1341, "pid": 2337800, "tid": 2340515, "ts": 6071193371214.405, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 2337800, "tid": 2340515, | |
"ts": 6071193452123.899, "dur": 14.261, | |
"args": { | |
"External id": 627, "cbid": 131, "correlation": 1342 | |
} | |
}, | |
{ | |
"ph": "s", "id": 1342, "pid": 2337800, "tid": 2340515, "ts": 6071193452123.899, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceSynchronize", "pid": 2337800, "tid": 2337800, | |
"ts": 6071193456547.529, "dur": 31.217, | |
"args": { | |
"cbid": 165, "correlation": 1353 | |
} | |
}, | |
{ | |
"ph": "s", "id": 1353, "pid": 2337800, "tid": 2337800, "ts": 6071193456547.529, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"name": "process_name", "ph": "M", "ts": 6071193017422.991, "pid": 2337800, "tid": 0, | |
"args": { | |
"name": "python" | |
} | |
}, | |
{ | |
"name": "process_labels", "ph": "M", "ts": 6071193017422.991, "pid": 2337800, "tid": 0, | |
"args": { | |
"labels": "CPU" | |
} | |
}, | |
{ | |
"name": "process_sort_index", "ph": "M", "ts": 6071193017422.991, "pid": 2337800, "tid": 0, | |
"args": { | |
"sort_index": 2337800 | |
} | |
}, | |
{ | |
"name": "process_name", "ph": "M", "ts": 6071193017422.991, "pid": 0, "tid": 0, | |
"args": { | |
"name": "python" | |
} | |
}, | |
{ | |
"name": "process_labels", "ph": "M", "ts": 6071193017422.991, "pid": 0, "tid": 0, | |
"args": { | |
"labels": "GPU 0" | |
} | |
}, | |
{ | |
"name": "process_sort_index", "ph": "M", "ts": 6071193017422.991, "pid": 0, "tid": 0, | |
"args": { | |
"sort_index": 5000000 | |
} | |
}, | |
{ | |
"name": "process_name", "ph": "M", "ts": 6071193017422.991, "pid": 1, "tid": 0, | |
"args": { | |
"name": "python" | |
} | |
}, | |
{ | |
"name": "process_labels", "ph": "M", "ts": 6071193017422.991, "pid": 1, "tid": 0, | |
"args": { | |
"labels": "GPU 1" | |
} | |
}, | |
{ | |
"name": "process_sort_index", "ph": "M", "ts": 6071193017422.991, "pid": 1, "tid": 0, | |
"args": { | |
"sort_index": 5000001 | |
} | |
}, | |
{ | |
"name": "process_name", "ph": "M", "ts": 6071193017422.991, "pid": 2, "tid": 0, | |
"args": { | |
"name": "python" | |
} | |
}, | |
{ | |
"name": "process_labels", "ph": "M", "ts": 6071193017422.991, "pid": 2, "tid": 0, | |
"args": { | |
"labels": "GPU 2" | |
} | |
}, | |
{ | |
"name": "process_sort_index", "ph": "M", "ts": 6071193017422.991, "pid": 2, "tid": 0, | |
"args": { | |
"sort_index": 5000002 | |
} | |
}, | |
{ | |
"name": "process_name", "ph": "M", "ts": 6071193017422.991, "pid": 3, "tid": 0, | |
"args": { | |
"name": "python" | |
} | |
}, | |
{ | |
"name": "process_labels", "ph": "M", "ts": 6071193017422.991, "pid": 3, "tid": 0, | |
"args": { | |
"labels": "GPU 3" | |
} | |
}, | |
{ | |
"name": "process_sort_index", "ph": "M", "ts": 6071193017422.991, "pid": 3, "tid": 0, | |
"args": { | |
"sort_index": 5000003 | |
} | |
}, | |
{ | |
"name": "process_name", "ph": "M", "ts": 6071193017422.991, "pid": 4, "tid": 0, | |
"args": { | |
"name": "python" | |
} | |
}, | |
{ | |
"name": "process_labels", "ph": "M", "ts": 6071193017422.991, "pid": 4, "tid": 0, | |
"args": { | |
"labels": "GPU 4" | |
} | |
}, | |
{ | |
"name": "process_sort_index", "ph": "M", "ts": 6071193017422.991, "pid": 4, "tid": 0, | |
"args": { | |
"sort_index": 5000004 | |
} | |
}, | |
{ | |
"name": "process_name", "ph": "M", "ts": 6071193017422.991, "pid": 5, "tid": 0, | |
"args": { | |
"name": "python" | |
} | |
}, | |
{ | |
"name": "process_labels", "ph": "M", "ts": 6071193017422.991, "pid": 5, "tid": 0, | |
"args": { | |
"labels": "GPU 5" | |
} | |
}, | |
{ | |
"name": "process_sort_index", "ph": "M", "ts": 6071193017422.991, "pid": 5, "tid": 0, | |
"args": { | |
"sort_index": 5000005 | |
} | |
}, | |
{ | |
"name": "process_name", "ph": "M", "ts": 6071193017422.991, "pid": 6, "tid": 0, | |
"args": { | |
"name": "python" | |
} | |
}, | |
{ | |
"name": "process_labels", "ph": "M", "ts": 6071193017422.991, "pid": 6, "tid": 0, | |
"args": { | |
"labels": "GPU 6" | |
} | |
}, | |
{ | |
"name": "process_sort_index", "ph": "M", "ts": 6071193017422.991, "pid": 6, "tid": 0, | |
"args": { | |
"sort_index": 5000006 | |
} | |
}, | |
{ | |
"name": "process_name", "ph": "M", "ts": 6071193017422.991, "pid": 7, "tid": 0, | |
"args": { | |
"name": "python" | |
} | |
}, | |
{ | |
"name": "process_labels", "ph": "M", "ts": 6071193017422.991, "pid": 7, "tid": 0, | |
"args": { | |
"labels": "GPU 7" | |
} | |
}, | |
{ | |
"name": "process_sort_index", "ph": "M", "ts": 6071193017422.991, "pid": 7, "tid": 0, | |
"args": { | |
"sort_index": 5000007 | |
} | |
}, | |
{ | |
"name": "process_name", "ph": "M", "ts": 6071193017422.991, "pid": 8, "tid": 0, | |
"args": { | |
"name": "python" | |
} | |
}, | |
{ | |
"name": "process_labels", "ph": "M", "ts": 6071193017422.991, "pid": 8, "tid": 0, | |
"args": { | |
"labels": "GPU 8" | |
} | |
}, | |
{ | |
"name": "process_sort_index", "ph": "M", "ts": 6071193017422.991, "pid": 8, "tid": 0, | |
"args": { | |
"sort_index": 5000008 | |
} | |
}, | |
{ | |
"name": "process_name", "ph": "M", "ts": 6071193017422.991, "pid": 9, "tid": 0, | |
"args": { | |
"name": "python" | |
} | |
}, | |
{ | |
"name": "process_labels", "ph": "M", "ts": 6071193017422.991, "pid": 9, "tid": 0, | |
"args": { | |
"labels": "GPU 9" | |
} | |
}, | |
{ | |
"name": "process_sort_index", "ph": "M", "ts": 6071193017422.991, "pid": 9, "tid": 0, | |
"args": { | |
"sort_index": 5000009 | |
} | |
}, | |
{ | |
"name": "process_name", "ph": "M", "ts": 6071193017422.991, "pid": 10, "tid": 0, | |
"args": { | |
"name": "python" | |
} | |
}, | |
{ | |
"name": "process_labels", "ph": "M", "ts": 6071193017422.991, "pid": 10, "tid": 0, | |
"args": { | |
"labels": "GPU 10" | |
} | |
}, | |
{ | |
"name": "process_sort_index", "ph": "M", "ts": 6071193017422.991, "pid": 10, "tid": 0, | |
"args": { | |
"sort_index": 5000010 | |
} | |
}, | |
{ | |
"name": "process_name", "ph": "M", "ts": 6071193017422.991, "pid": 11, "tid": 0, | |
"args": { | |
"name": "python" | |
} | |
}, | |
{ | |
"name": "process_labels", "ph": "M", "ts": 6071193017422.991, "pid": 11, "tid": 0, | |
"args": { | |
"labels": "GPU 11" | |
} | |
}, | |
{ | |
"name": "process_sort_index", "ph": "M", "ts": 6071193017422.991, "pid": 11, "tid": 0, | |
"args": { | |
"sort_index": 5000011 | |
} | |
}, | |
{ | |
"name": "process_name", "ph": "M", "ts": 6071193017422.991, "pid": 12, "tid": 0, | |
"args": { | |
"name": "python" | |
} | |
}, | |
{ | |
"name": "process_labels", "ph": "M", "ts": 6071193017422.991, "pid": 12, "tid": 0, | |
"args": { | |
"labels": "GPU 12" | |
} | |
}, | |
{ | |
"name": "process_sort_index", "ph": "M", "ts": 6071193017422.991, "pid": 12, "tid": 0, | |
"args": { | |
"sort_index": 5000012 | |
} | |
}, | |
{ | |
"name": "process_name", "ph": "M", "ts": 6071193017422.991, "pid": 13, "tid": 0, | |
"args": { | |
"name": "python" | |
} | |
}, | |
{ | |
"name": "process_labels", "ph": "M", "ts": 6071193017422.991, "pid": 13, "tid": 0, | |
"args": { | |
"labels": "GPU 13" | |
} | |
}, | |
{ | |
"name": "process_sort_index", "ph": "M", "ts": 6071193017422.991, "pid": 13, "tid": 0, | |
"args": { | |
"sort_index": 5000013 | |
} | |
}, | |
{ | |
"name": "process_name", "ph": "M", "ts": 6071193017422.991, "pid": 14, "tid": 0, | |
"args": { | |
"name": "python" | |
} | |
}, | |
{ | |
"name": "process_labels", "ph": "M", "ts": 6071193017422.991, "pid": 14, "tid": 0, | |
"args": { | |
"labels": "GPU 14" | |
} | |
}, | |
{ | |
"name": "process_sort_index", "ph": "M", "ts": 6071193017422.991, "pid": 14, "tid": 0, | |
"args": { | |
"sort_index": 5000014 | |
} | |
}, | |
{ | |
"name": "process_name", "ph": "M", "ts": 6071193017422.991, "pid": 15, "tid": 0, | |
"args": { | |
"name": "python" | |
} | |
}, | |
{ | |
"name": "process_labels", "ph": "M", "ts": 6071193017422.991, "pid": 15, "tid": 0, | |
"args": { | |
"labels": "GPU 15" | |
} | |
}, | |
{ | |
"name": "process_sort_index", "ph": "M", "ts": 6071193017422.991, "pid": 15, "tid": 0, | |
"args": { | |
"sort_index": 5000015 | |
} | |
}, | |
{ | |
"name": "thread_name", "ph": "M", "ts": 6071193017422.991, "pid": 0, "tid": 7, | |
"args": { | |
"name": "stream 7 " | |
} | |
}, | |
{ | |
"name": "thread_sort_index", "ph": "M", "ts": 6071193017422.991, "pid": 0, "tid": 7, | |
"args": { | |
"sort_index": 7 | |
} | |
}, | |
{ | |
"name": "thread_name", "ph": "M", "ts": 6071193017422.991, "pid": 2337800, "tid": 2340515, | |
"args": { | |
"name": "thread 2340515 (pt_autograd_0)" | |
} | |
}, | |
{ | |
"name": "thread_sort_index", "ph": "M", "ts": 6071193017422.991, "pid": 2337800, "tid": 2340515, | |
"args": { | |
"sort_index": 2340515 | |
} | |
}, | |
{ | |
"name": "thread_name", "ph": "M", "ts": 6071193017422.991, "pid": 2337800, "tid": 2340515, | |
"args": { | |
"name": "thread 2340515 (python)" | |
} | |
}, | |
{ | |
"name": "thread_sort_index", "ph": "M", "ts": 6071193017422.991, "pid": 2337800, "tid": 2340515, | |
"args": { | |
"sort_index": 2340515 | |
} | |
}, | |
{ | |
"name": "thread_name", "ph": "M", "ts": 6071193017422.991, "pid": 2337800, "tid": 2337800, | |
"args": { | |
"name": "thread 2337800 (python)" | |
} | |
}, | |
{ | |
"name": "thread_sort_index", "ph": "M", "ts": 6071193017422.991, "pid": 2337800, "tid": 2337800, | |
"args": { | |
"sort_index": 2337800 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "Trace", "ts": 6071193017369.210, "dur": 439222.959, | |
"pid": "Spans", "tid": "PyTorch Profiler", | |
"name": "PyTorch Profiler (0)", | |
"args": { | |
"Op count": 0 | |
} | |
}, | |
{ | |
"name": "process_sort_index", "ph": "M", "ts": 6071193017369.210, | |
"pid": "Spans", "tid": 0, | |
"args": { | |
"sort_index": 536870912 | |
} | |
}, | |
{ | |
"name": "Iteration Start: PyTorch Profiler", "ph": "i", "s": "g", | |
"pid": "Traces", "tid": "Trace PyTorch Profiler", "ts": 6071193017369.210 | |
}, | |
{ | |
"name": "Record Window End", "ph": "i", "s": "g", | |
"pid": "", "tid": "", "ts": 6071193457002.819 | |
} | |
], | |
"traceName": "/tmp/trace.json" | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment