Skip to content

Instantly share code, notes, and snippets.

@shunting314
Created June 10, 2025 22:00
Show Gist options
  • Save shunting314/3e86692884f446bb8714d6b8c83e4079 to your computer and use it in GitHub Desktop.
Save shunting314/3e86692884f446bb8714d6b8c83e4079 to your computer and use it in GitHub Desktop.
{
"schemaVersion": 1,
"deviceProperties": [
{
"id": 0, "name": "NVIDIA H100", "totalGlobalMem": 102010781696,
"computeMajor": 9, "computeMinor": 0,
"maxThreadsPerBlock": 1024, "maxThreadsPerMultiprocessor": 2048,
"regsPerBlock": 65536, "warpSize": 32,
"sharedMemPerBlock": 49152, "numSms": 132
, "regsPerMultiprocessor": 65536, "sharedMemPerBlockOptin": 232448, "sharedMemPerMultiprocessor": 233472
}
],
"cupti_version": 24,
"cuda_runtime_version": 12060,
"cuda_driver_version": 12020,
"trace_id": "B34F20FE8F5E46E7896FC32BAFC99ABC",
"displayTimeUnit": "ms",
"baseTimeNanoseconds": 1743521598000000000,
"traceEvents": [
{
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 2337800, "tid": 2340515,
"ts": 6071193020828.082, "dur": 546.544,
"args": {
"External id": 513,"Record function id": 0, "Sequence number": 134, "Fwd thread id": 1, "Ev Idx": 0
}
},
{
"ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 2337800, "tid": 2340515,
"ts": 6071193020842.103, "dur": 499.112,
"args": {
"External id": 514,"Record function id": 0, "Sequence number": 134, "Fwd thread id": 1, "Ev Idx": 1
}
},
{
"ph": "f", "id": 1, "pid": 2337800, "tid": 2340515, "ts": 6071193020842.103,
"cat": "fwdbwd", "name": "fwdbwd", "bp": "e"
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_0", "pid": 2337800, "tid": 2340515,
"ts": 6071193021150.788, "dur": 76.175,
"args": {
"External id": 515,"Record function id": 0, "Ev Idx": 2
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_1", "pid": 2337800, "tid": 2340515,
"ts": 6071193021242.066, "dur": 13.891,
"args": {
"External id": 516,"Record function id": 0, "Ev Idx": 3
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_mul_2", "pid": 2337800, "tid": 2340515,
"ts": 6071193021284.670, "dur": 25.589,
"args": {
"External id": 517,"Record function id": 0, "Ev Idx": 4
}
},
{
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 2337800, "tid": 2340515,
"ts": 6071193021395.046, "dur": 27.652,
"args": {
"External id": 518,"Record function id": 0, "Ev Idx": 5
}
},
{
"ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 2337800, "tid": 2340515,
"ts": 6071193021398.151, "dur": 22.033,
"args": {
"External id": 519,"Record function id": 0, "Ev Idx": 6
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 2337800, "tid": 2340515,
"ts": 6071193021407.796, "dur": 10.846,
"args": {
"External id": 520,"Record function id": 0, "Ev Idx": 7
}
},
{
"ph": "X", "cat": "cpu_op", "name": "detach", "pid": 2337800, "tid": 2340515,
"ts": 6071193021410.149, "dur": 8.333,
"args": {
"External id": 521,"Record function id": 0, "Ev Idx": 8
}
},
{
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 2337800, "tid": 2340515,
"ts": 6071193021427.676, "dur": 4.216,
"args": {
"External id": 522,"Record function id": 0, "Ev Idx": 9
}
},
{
"ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 2337800, "tid": 2340515,
"ts": 6071193021429.148, "dur": 1.973,
"args": {
"External id": 523,"Record function id": 0, "Ev Idx": 10
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 2337800, "tid": 2340515,
"ts": 6071193021429.809, "dur": 0.961,
"args": {
"External id": 524,"Record function id": 0, "Ev Idx": 11
}
},
{
"ph": "X", "cat": "cpu_op", "name": "detach", "pid": 2337800, "tid": 2340515,
"ts": 6071193021430.039, "dur": 0.611,
"args": {
"External id": 525,"Record function id": 0, "Ev Idx": 12
}
},
{
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ToCopyBackward0", "pid": 2337800, "tid": 2340515,
"ts": 6071193021436.669, "dur": 80152.041,
"args": {
"External id": 526,"Record function id": 0, "Ev Idx": 13
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 2337800, "tid": 2340515,
"ts": 6071193021441.957, "dur": 86.551,
"args": {
"External id": 527,"Record function id": 0, "Ev Idx": 14
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 2337800, "tid": 2340515,
"ts": 6071193021450.019, "dur": 24.497,
"args": {
"External id": 528,"Record function id": 0, "Ev Idx": 15
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 2337800, "tid": 2340515,
"ts": 6071193021454.196, "dur": 19.779,
"args": {
"External id": 529,"Record function id": 0, "Ev Idx": 16
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 2337800, "tid": 2340515,
"ts": 6071193021485.723, "dur": 42.444,
"args": {
"External id": 530,"Record function id": 0, "Ev Idx": 17
}
},
{
"ph": "X", "cat": "cpu_op", "name": "ToCopyBackward0", "pid": 2337800, "tid": 2340515,
"ts": 6071193021529.269, "dur": 80010.297,
"args": {
"External id": 531,"Record function id": 0, "Sequence number": 0, "Fwd thread id": 1, "Ev Idx": 18
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 2337800, "tid": 2340515,
"ts": 6071193021533.505, "dur": 80003.157,
"args": {
"External id": 532,"Record function id": 0, "Ev Idx": 19
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 2337800, "tid": 2340515,
"ts": 6071193021536.900, "dur": 79997.639,
"args": {
"External id": 533,"Record function id": 0, "Ev Idx": 20
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 2337800, "tid": 2340515,
"ts": 6071193021544.382, "dur": 25.839,
"args": {
"External id": 534,"Record function id": 0, "Ev Idx": 21
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 2337800, "tid": 2340515,
"ts": 6071193021571.072, "dur": 79951.138,
"args": {
"External id": 535,"Record function id": 0, "Ev Idx": 22
}
},
{
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 2337800, "tid": 2340515,
"ts": 6071193107960.067, "dur": 553.505,
"args": {
"External id": 536,"Record function id": 0, "Sequence number": 135, "Fwd thread id": 1, "Ev Idx": 23
}
},
{
"ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 2337800, "tid": 2340515,
"ts": 6071193107968.520, "dur": 521.386,
"args": {
"External id": 537,"Record function id": 0, "Sequence number": 135, "Fwd thread id": 1, "Ev Idx": 24
}
},
{
"ph": "f", "id": 2, "pid": 2337800, "tid": 2340515, "ts": 6071193107968.520,
"cat": "fwdbwd", "name": "fwdbwd", "bp": "e"
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_0", "pid": 2337800, "tid": 2340515,
"ts": 6071193108318.467, "dur": 71.628,
"args": {
"External id": 538,"Record function id": 0, "Ev Idx": 25
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_1", "pid": 2337800, "tid": 2340515,
"ts": 6071193108404.808, "dur": 14.241,
"args": {
"External id": 539,"Record function id": 0, "Ev Idx": 26
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_mul_2", "pid": 2337800, "tid": 2340515,
"ts": 6071193108446.530, "dur": 11.288,
"args": {
"External id": 540,"Record function id": 0, "Ev Idx": 27
}
},
{
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 2337800, "tid": 2340515,
"ts": 6071193108532.600, "dur": 25.449,
"args": {
"External id": 541,"Record function id": 0, "Ev Idx": 28
}
},
{
"ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 2337800, "tid": 2340515,
"ts": 6071193108535.495, "dur": 20.801,
"args": {
"External id": 542,"Record function id": 0, "Ev Idx": 29
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 2337800, "tid": 2340515,
"ts": 6071193108545.570, "dur": 9.995,
"args": {
"External id": 543,"Record function id": 0, "Ev Idx": 30
}
},
{
"ph": "X", "cat": "cpu_op", "name": "detach", "pid": 2337800, "tid": 2340515,
"ts": 6071193108547.823, "dur": 7.441,
"args": {
"External id": 544,"Record function id": 0, "Ev Idx": 31
}
},
{
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 2337800, "tid": 2340515,
"ts": 6071193108562.826, "dur": 4.256,
"args": {
"External id": 545,"Record function id": 0, "Ev Idx": 32
}
},
{
"ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 2337800, "tid": 2340515,
"ts": 6071193108564.218, "dur": 2.123,
"args": {
"External id": 546,"Record function id": 0, "Ev Idx": 33
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 2337800, "tid": 2340515,
"ts": 6071193108564.969, "dur": 1.052,
"args": {
"External id": 547,"Record function id": 0, "Ev Idx": 34
}
},
{
"ph": "X", "cat": "cpu_op", "name": "detach", "pid": 2337800, "tid": 2340515,
"ts": 6071193108565.300, "dur": 0.600,
"args": {
"External id": 548,"Record function id": 0, "Ev Idx": 35
}
},
{
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ToCopyBackward0", "pid": 2337800, "tid": 2340515,
"ts": 6071193108571.389, "dur": 82018.265,
"args": {
"External id": 549,"Record function id": 0, "Ev Idx": 36
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 2337800, "tid": 2340515,
"ts": 6071193108575.665, "dur": 71.889,
"args": {
"External id": 550,"Record function id": 0, "Ev Idx": 37
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 2337800, "tid": 2340515,
"ts": 6071193108582.616, "dur": 22.163,
"args": {
"External id": 551,"Record function id": 0, "Ev Idx": 38
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 2337800, "tid": 2340515,
"ts": 6071193108586.161, "dur": 18.087,
"args": {
"External id": 552,"Record function id": 0, "Ev Idx": 39
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 2337800, "tid": 2340515,
"ts": 6071193108606.712, "dur": 40.541,
"args": {
"External id": 553,"Record function id": 0, "Ev Idx": 40
}
},
{
"ph": "X", "cat": "cpu_op", "name": "ToCopyBackward0", "pid": 2337800, "tid": 2340515,
"ts": 6071193108648.575, "dur": 81892.586,
"args": {
"External id": 554,"Record function id": 0, "Sequence number": 0, "Fwd thread id": 1, "Ev Idx": 41
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 2337800, "tid": 2340515,
"ts": 6071193108652.251, "dur": 81885.154,
"args": {
"External id": 555,"Record function id": 0, "Ev Idx": 42
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 2337800, "tid": 2340515,
"ts": 6071193108655.375, "dur": 81879.416,
"args": {
"External id": 556,"Record function id": 0, "Ev Idx": 43
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 2337800, "tid": 2340515,
"ts": 6071193108661.585, "dur": 28.963,
"args": {
"External id": 557,"Record function id": 0, "Ev Idx": 44
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 2337800, "tid": 2340515,
"ts": 6071193108691.500, "dur": 81829.570,
"args": {
"External id": 558,"Record function id": 0, "Ev Idx": 45
}
},
{
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 2337800, "tid": 2340515,
"ts": 6071193196242.517, "dur": 569.128,
"args": {
"External id": 559,"Record function id": 0, "Sequence number": 136, "Fwd thread id": 1, "Ev Idx": 46
}
},
{
"ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 2337800, "tid": 2340515,
"ts": 6071193196250.239, "dur": 533.464,
"args": {
"External id": 560,"Record function id": 0, "Sequence number": 136, "Fwd thread id": 1, "Ev Idx": 47
}
},
{
"ph": "f", "id": 3, "pid": 2337800, "tid": 2340515, "ts": 6071193196250.239,
"cat": "fwdbwd", "name": "fwdbwd", "bp": "e"
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_0", "pid": 2337800, "tid": 2340515,
"ts": 6071193196607.127, "dur": 75.053,
"args": {
"External id": 561,"Record function id": 0, "Ev Idx": 48
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_1", "pid": 2337800, "tid": 2340515,
"ts": 6071193196695.991, "dur": 15.463,
"args": {
"External id": 562,"Record function id": 0, "Ev Idx": 49
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_mul_2", "pid": 2337800, "tid": 2340515,
"ts": 6071193196738.926, "dur": 12.569,
"args": {
"External id": 563,"Record function id": 0, "Ev Idx": 50
}
},
{
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 2337800, "tid": 2340515,
"ts": 6071193196831.535, "dur": 27.962,
"args": {
"External id": 564,"Record function id": 0, "Ev Idx": 51
}
},
{
"ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 2337800, "tid": 2340515,
"ts": 6071193196835.171, "dur": 22.063,
"args": {
"External id": 565,"Record function id": 0, "Ev Idx": 52
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 2337800, "tid": 2340515,
"ts": 6071193196846.017, "dur": 9.825,
"args": {
"External id": 566,"Record function id": 0, "Ev Idx": 53
}
},
{
"ph": "X", "cat": "cpu_op", "name": "detach", "pid": 2337800, "tid": 2340515,
"ts": 6071193196848.581, "dur": 7.071,
"args": {
"External id": 567,"Record function id": 0, "Ev Idx": 54
}
},
{
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 2337800, "tid": 2340515,
"ts": 6071193196865.577, "dur": 5.658,
"args": {
"External id": 568,"Record function id": 0, "Ev Idx": 55
}
},
{
"ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 2337800, "tid": 2340515,
"ts": 6071193196867.419, "dur": 2.774,
"args": {
"External id": 569,"Record function id": 0, "Ev Idx": 56
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 2337800, "tid": 2340515,
"ts": 6071193196868.571, "dur": 1.172,
"args": {
"External id": 570,"Record function id": 0, "Ev Idx": 57
}
},
{
"ph": "X", "cat": "cpu_op", "name": "detach", "pid": 2337800, "tid": 2340515,
"ts": 6071193196868.982, "dur": 0.611,
"args": {
"External id": 571,"Record function id": 0, "Ev Idx": 58
}
},
{
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ToCopyBackward0", "pid": 2337800, "tid": 2340515,
"ts": 6071193196877.104, "dur": 81367.083,
"args": {
"External id": 572,"Record function id": 0, "Ev Idx": 59
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 2337800, "tid": 2340515,
"ts": 6071193196882.682, "dur": 82.334,
"args": {
"External id": 573,"Record function id": 0, "Ev Idx": 60
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 2337800, "tid": 2340515,
"ts": 6071193196890.714, "dur": 26.240,
"args": {
"External id": 574,"Record function id": 0, "Ev Idx": 61
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 2337800, "tid": 2340515,
"ts": 6071193196895.231, "dur": 21.042,
"args": {
"External id": 575,"Record function id": 0, "Ev Idx": 62
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 2337800, "tid": 2340515,
"ts": 6071193196919.918, "dur": 44.738,
"args": {
"External id": 576,"Record function id": 0, "Ev Idx": 63
}
},
{
"ph": "X", "cat": "cpu_op", "name": "ToCopyBackward0", "pid": 2337800, "tid": 2340515,
"ts": 6071193196966.108, "dur": 81234.904,
"args": {
"External id": 577,"Record function id": 0, "Sequence number": 0, "Fwd thread id": 1, "Ev Idx": 64
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 2337800, "tid": 2340515,
"ts": 6071193196970.244, "dur": 81228.715,
"args": {
"External id": 578,"Record function id": 0, "Ev Idx": 65
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 2337800, "tid": 2340515,
"ts": 6071193196973.880, "dur": 81223.527,
"args": {
"External id": 579,"Record function id": 0, "Ev Idx": 66
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 2337800, "tid": 2340515,
"ts": 6071193196980.971, "dur": 36.935,
"args": {
"External id": 580,"Record function id": 0, "Ev Idx": 67
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 2337800, "tid": 2340515,
"ts": 6071193197018.888, "dur": 81169.024,
"args": {
"External id": 581,"Record function id": 0, "Ev Idx": 68
}
},
{
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 2337800, "tid": 2340515,
"ts": 6071193283965.243, "dur": 573.966,
"args": {
"External id": 582,"Record function id": 0, "Sequence number": 137, "Fwd thread id": 1, "Ev Idx": 69
}
},
{
"ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 2337800, "tid": 2340515,
"ts": 6071193283974.738, "dur": 537.941,
"args": {
"External id": 583,"Record function id": 0, "Sequence number": 137, "Fwd thread id": 1, "Ev Idx": 70
}
},
{
"ph": "f", "id": 4, "pid": 2337800, "tid": 2340515, "ts": 6071193283974.738,
"cat": "fwdbwd", "name": "fwdbwd", "bp": "e"
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_0", "pid": 2337800, "tid": 2340515,
"ts": 6071193284333.498, "dur": 75.374,
"args": {
"External id": 584,"Record function id": 0, "Ev Idx": 71
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_1", "pid": 2337800, "tid": 2340515,
"ts": 6071193284423.855, "dur": 14.452,
"args": {
"External id": 585,"Record function id": 0, "Ev Idx": 72
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_mul_2", "pid": 2337800, "tid": 2340515,
"ts": 6071193284465.898, "dur": 14.342,
"args": {
"External id": 586,"Record function id": 0, "Ev Idx": 73
}
},
{
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 2337800, "tid": 2340515,
"ts": 6071193284559.229, "dur": 29.905,
"args": {
"External id": 587,"Record function id": 0, "Ev Idx": 74
}
},
{
"ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 2337800, "tid": 2340515,
"ts": 6071193284562.854, "dur": 23.946,
"args": {
"External id": 588,"Record function id": 0, "Ev Idx": 75
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 2337800, "tid": 2340515,
"ts": 6071193284573.861, "dur": 11.657,
"args": {
"External id": 589,"Record function id": 0, "Ev Idx": 76
}
},
{
"ph": "X", "cat": "cpu_op", "name": "detach", "pid": 2337800, "tid": 2340515,
"ts": 6071193284576.455, "dur": 8.883,
"args": {
"External id": 590,"Record function id": 0, "Ev Idx": 77
}
},
{
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 2337800, "tid": 2340515,
"ts": 6071193284594.913, "dur": 5.388,
"args": {
"External id": 591,"Record function id": 0, "Ev Idx": 78
}
},
{
"ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 2337800, "tid": 2340515,
"ts": 6071193284596.685, "dur": 2.674,
"args": {
"External id": 592,"Record function id": 0, "Ev Idx": 79
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 2337800, "tid": 2340515,
"ts": 6071193284597.657, "dur": 1.252,
"args": {
"External id": 593,"Record function id": 0, "Ev Idx": 80
}
},
{
"ph": "X", "cat": "cpu_op", "name": "detach", "pid": 2337800, "tid": 2340515,
"ts": 6071193284597.957, "dur": 0.791,
"args": {
"External id": 594,"Record function id": 0, "Ev Idx": 81
}
},
{
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ToCopyBackward0", "pid": 2337800, "tid": 2340515,
"ts": 6071193284606.260, "dur": 79427.378,
"args": {
"External id": 595,"Record function id": 0, "Ev Idx": 82
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 2337800, "tid": 2340515,
"ts": 6071193284612.259, "dur": 86.019,
"args": {
"External id": 596,"Record function id": 0, "Ev Idx": 83
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 2337800, "tid": 2340515,
"ts": 6071193284619.910, "dur": 28.884,
"args": {
"External id": 597,"Record function id": 0, "Ev Idx": 84
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 2337800, "tid": 2340515,
"ts": 6071193284624.467, "dur": 23.776,
"args": {
"External id": 598,"Record function id": 0, "Ev Idx": 85
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 2337800, "tid": 2340515,
"ts": 6071193284651.738, "dur": 46.070,
"args": {
"External id": 599,"Record function id": 0, "Ev Idx": 86
}
},
{
"ph": "X", "cat": "cpu_op", "name": "ToCopyBackward0", "pid": 2337800, "tid": 2340515,
"ts": 6071193284699.450, "dur": 79288.820,
"args": {
"External id": 600,"Record function id": 0, "Sequence number": 0, "Fwd thread id": 1, "Ev Idx": 87
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 2337800, "tid": 2340515,
"ts": 6071193284703.086, "dur": 79282.720,
"args": {
"External id": 601,"Record function id": 0, "Ev Idx": 88
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 2337800, "tid": 2340515,
"ts": 6071193284706.851, "dur": 79276.772,
"args": {
"External id": 602,"Record function id": 0, "Ev Idx": 89
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 2337800, "tid": 2340515,
"ts": 6071193284713.822, "dur": 36.485,
"args": {
"External id": 603,"Record function id": 0, "Ev Idx": 90
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 2337800, "tid": 2340515,
"ts": 6071193284751.819, "dur": 79220.237,
"args": {
"External id": 604,"Record function id": 0, "Ev Idx": 91
}
},
{
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 2337800, "tid": 2340515,
"ts": 6071193370507.930, "dur": 524.110,
"args": {
"External id": 605,"Record function id": 0, "Sequence number": 138, "Fwd thread id": 1, "Ev Idx": 92
}
},
{
"ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 2337800, "tid": 2340515,
"ts": 6071193370517.485, "dur": 489.838,
"args": {
"External id": 606,"Record function id": 0, "Sequence number": 138, "Fwd thread id": 1, "Ev Idx": 93
}
},
{
"ph": "f", "id": 5, "pid": 2337800, "tid": 2340515, "ts": 6071193370517.485,
"cat": "fwdbwd", "name": "fwdbwd", "bp": "e"
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_0", "pid": 2337800, "tid": 2340515,
"ts": 6071193370845.729, "dur": 65.529,
"args": {
"External id": 607,"Record function id": 0, "Ev Idx": 94
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_1", "pid": 2337800, "tid": 2340515,
"ts": 6071193370925.510, "dur": 13.921,
"args": {
"External id": 608,"Record function id": 0, "Ev Idx": 95
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_mul_2", "pid": 2337800, "tid": 2340515,
"ts": 6071193370966.001, "dur": 12.038,
"args": {
"External id": 609,"Record function id": 0, "Ev Idx": 96
}
},
{
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 2337800, "tid": 2340515,
"ts": 6071193371050.979, "dur": 24.627,
"args": {
"External id": 610,"Record function id": 0, "Ev Idx": 97
}
},
{
"ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 2337800, "tid": 2340515,
"ts": 6071193371054.304, "dur": 19.600,
"args": {
"External id": 611,"Record function id": 0, "Ev Idx": 98
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 2337800, "tid": 2340515,
"ts": 6071193371063.057, "dur": 9.755,
"args": {
"External id": 612,"Record function id": 0, "Ev Idx": 99
}
},
{
"ph": "X", "cat": "cpu_op", "name": "detach", "pid": 2337800, "tid": 2340515,
"ts": 6071193371065.281, "dur": 7.260,
"args": {
"External id": 613,"Record function id": 0, "Ev Idx": 100
}
},
{
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 2337800, "tid": 2340515,
"ts": 6071193371079.923, "dur": 4.356,
"args": {
"External id": 614,"Record function id": 0, "Ev Idx": 101
}
},
{
"ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 2337800, "tid": 2340515,
"ts": 6071193371081.435, "dur": 2.093,
"args": {
"External id": 615,"Record function id": 0, "Ev Idx": 102
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 2337800, "tid": 2340515,
"ts": 6071193371082.126, "dur": 1.072,
"args": {
"External id": 616,"Record function id": 0, "Ev Idx": 103
}
},
{
"ph": "X", "cat": "cpu_op", "name": "detach", "pid": 2337800, "tid": 2340515,
"ts": 6071193371082.466, "dur": 0.621,
"args": {
"External id": 617,"Record function id": 0, "Ev Idx": 104
}
},
{
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ToCopyBackward0", "pid": 2337800, "tid": 2340515,
"ts": 6071193371088.656, "dur": 81125.689,
"args": {
"External id": 618,"Record function id": 0, "Ev Idx": 105
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 2337800, "tid": 2340515,
"ts": 6071193371093.363, "dur": 70.456,
"args": {
"External id": 619,"Record function id": 0, "Ev Idx": 106
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 2337800, "tid": 2340515,
"ts": 6071193371100.604, "dur": 22.854,
"args": {
"External id": 620,"Record function id": 0, "Ev Idx": 107
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 2337800, "tid": 2340515,
"ts": 6071193371104.570, "dur": 18.337,
"args": {
"External id": 621,"Record function id": 0, "Ev Idx": 108
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 2337800, "tid": 2340515,
"ts": 6071193371125.461, "dur": 38.008,
"args": {
"External id": 622,"Record function id": 0, "Ev Idx": 109
}
},
{
"ph": "X", "cat": "cpu_op", "name": "ToCopyBackward0", "pid": 2337800, "tid": 2340515,
"ts": 6071193371164.580, "dur": 81005.348,
"args": {
"External id": 623,"Record function id": 0, "Sequence number": 0, "Fwd thread id": 1, "Ev Idx": 110
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 2337800, "tid": 2340515,
"ts": 6071193371167.895, "dur": 80999.349,
"args": {
"External id": 624,"Record function id": 0, "Ev Idx": 111
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 2337800, "tid": 2340515,
"ts": 6071193371171.090, "dur": 80994.251,
"args": {
"External id": 625,"Record function id": 0, "Ev Idx": 112
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 2337800, "tid": 2340515,
"ts": 6071193371177.279, "dur": 34.182,
"args": {
"External id": 626,"Record function id": 0, "Ev Idx": 113
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 2337800, "tid": 2340515,
"ts": 6071193371212.412, "dur": 80942.073,
"args": {
"External id": 627,"Record function id": 0, "Ev Idx": 114
}
},
{
"ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 2337800, "tid": 2337800,
"ts": 6071193017607.826, "dur": 83.917,
"args": {
"External id": 1,"Record function id": 0, "Ev Idx": 115
}
},
{
"ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 0/0", "pid": 2337800, "tid": 2337800,
"ts": 6071193017695.438, "dur": 88723.065,
"args": {
"External id": 2,"Record function id": 0, "Ev Idx": 116
}
},
{
"ph": "X", "cat": "cpu_op", "name": "Pregraph bytecode", "pid": 2337800, "tid": 2337800,
"ts": 6071193017718.353, "dur": 6.940,
"args": {
"External id": 3,"Record function id": 0, "Ev Idx": 117
}
},
{
"ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 2337800, "tid": 2337800,
"ts": 6071193017821.979, "dur": 2651.638,
"args": {
"External id": 4,"Record function id": 0, "Sequence number": 134, "Fwd thread id": 0, "Ev Idx": 118
}
},
{
"ph": "s", "id": 1, "pid": 2337800, "tid": 2337800, "ts": 6071193017821.979,
"cat": "fwdbwd", "name": "fwdbwd"
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused_nll_loss_backward_nll_loss_forward_0", "pid": 2337800, "tid": 2337800,
"ts": 6071193019656.365, "dur": 66.140,
"args": {
"External id": 5,"Record function id": 0, "Ev Idx": 119
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_per_fused_nll_loss_forward_1", "pid": 2337800, "tid": 2337800,
"ts": 6071193019740.922, "dur": 14.382,
"args": {
"External id": 6,"Record function id": 0, "Ev Idx": 120
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_mul_2", "pid": 2337800, "tid": 2337800,
"ts": 6071193019775.545, "dur": 15.273,
"args": {
"External id": 7,"Record function id": 0, "Ev Idx": 121
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 2337800, "tid": 2337800,
"ts": 6071193019827.603, "dur": 56.625,
"args": {
"External id": 8,"Record function id": 0, "Ev Idx": 122
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 2337800, "tid": 2337800,
"ts": 6071193019840.763, "dur": 38.909,
"args": {
"External id": 9,"Record function id": 0, "Ev Idx": 123
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 2337800, "tid": 2337800,
"ts": 6071193019851.068, "dur": 22.675,
"args": {
"External id": 10,"Record function id": 0, "Ev Idx": 124
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 2337800, "tid": 2337800,
"ts": 6071193019862.145, "dur": 11.157,
"args": {
"External id": 11,"Record function id": 0, "Ev Idx": 125
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2337800, "tid": 2337800,
"ts": 6071193019866.231, "dur": 4.617,
"args": {
"External id": 12,"Record function id": 0, "Ev Idx": 126
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 2337800, "tid": 2337800,
"ts": 6071193019897.228, "dur": 3.415,
"args": {
"External id": 13,"Record function id": 0, "Ev Idx": 127
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 2337800, "tid": 2337800,
"ts": 6071193019897.619, "dur": 2.724,
"args": {
"External id": 14,"Record function id": 0, "Ev Idx": 128
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 2337800, "tid": 2337800,
"ts": 6071193019898.149, "dur": 1.513,
"args": {
"External id": 15,"Record function id": 0, "Ev Idx": 129
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 2337800, "tid": 2337800,
"ts": 6071193019898.530, "dur": 0.961,
"args": {
"External id": 16,"Record function id": 0, "Ev Idx": 130
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2337800, "tid": 2337800,
"ts": 6071193019898.911, "dur": 0.360,
"args": {
"External id": 17,"Record function id": 0, "Ev Idx": 131
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 2337800, "tid": 2337800,
"ts": 6071193019904.259, "dur": 2.724,
"args": {
"External id": 18,"Record function id": 0, "Ev Idx": 132
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 2337800, "tid": 2337800,
"ts": 6071193019904.569, "dur": 2.193,
"args": {
"External id": 19,"Record function id": 0, "Ev Idx": 133
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 2337800, "tid": 2337800,
"ts": 6071193019904.960, "dur": 1.232,
"args": {
"External id": 20,"Record function id": 0, "Ev Idx": 134
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 2337800, "tid": 2337800,
"ts": 6071193019905.330, "dur": 0.731,
"args": {
"External id": 21,"Record function id": 0, "Ev Idx": 135
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2337800, "tid": 2337800,
"ts": 6071193019905.641, "dur": 0.250,
"args": {
"External id": 22,"Record function id": 0, "Ev Idx": 136
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 2337800, "tid": 2337800,
"ts": 6071193019910.178, "dur": 2.533,
"args": {
"External id": 23,"Record function id": 0, "Ev Idx": 137
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 2337800, "tid": 2337800,
"ts": 6071193019910.468, "dur": 1.993,
"args": {
"External id": 24,"Record function id": 0, "Ev Idx": 138
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 2337800, "tid": 2337800,
"ts": 6071193019910.829, "dur": 1.101,
"args": {
"External id": 25,"Record function id": 0, "Ev Idx": 139
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 2337800, "tid": 2337800,
"ts": 6071193019911.149, "dur": 0.661,
"args": {
"External id": 26,"Record function id": 0, "Ev Idx": 140
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2337800, "tid": 2337800,
"ts": 6071193019911.419, "dur": 0.221,
"args": {
"External id": 27,"Record function id": 0, "Ev Idx": 141
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_3", "pid": 2337800, "tid": 2337800,
"ts": 6071193019928.565, "dur": 14.602,
"args": {
"External id": 28,"Record function id": 0, "Ev Idx": 142
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_4", "pid": 2337800, "tid": 2337800,
"ts": 6071193019955.856, "dur": 11.418,
"args": {
"External id": 29,"Record function id": 0, "Ev Idx": 143
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_5", "pid": 2337800, "tid": 2337800,
"ts": 6071193019982.527, "dur": 10.976,
"args": {
"External id": 30,"Record function id": 0, "Ev Idx": 144
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 2337800, "tid": 2337800,
"ts": 6071193020026.663, "dur": 91.368,
"args": {
"External id": 31,"Record function id": 0, "Ev Idx": 145
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused__log_softmax__log_softmax_backward_data_addmm_nll_loss_backward_nll_loss_forward_6", "pid": 2337800, "tid": 2337800,
"ts": 6071193020139.233, "dur": 22.063,
"args": {
"External id": 32,"Record function id": 0, "Ev Idx": 146
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 2337800, "tid": 2337800,
"ts": 6071193020169.759, "dur": 32.349,
"args": {
"External id": 33,"Record function id": 0, "Ev Idx": 147
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused_nll_loss_forward_sum_7", "pid": 2337800, "tid": 2337800,
"ts": 6071193020216.610, "dur": 14.722,
"args": {
"External id": 34,"Record function id": 0, "Ev Idx": 148
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::addmm", "pid": 2337800, "tid": 2337800,
"ts": 6071193020243.059, "dur": 86.631,
"args": {
"External id": 35,"Record function id": 0, "Ev Idx": 149
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused_nll_loss_forward_8", "pid": 2337800, "tid": 2337800,
"ts": 6071193020356.551, "dur": 14.391,
"args": {
"External id": 36,"Record function id": 0, "Ev Idx": 150
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_per_fused_nll_loss_forward_9", "pid": 2337800, "tid": 2337800,
"ts": 6071193020383.722, "dur": 13.440,
"args": {
"External id": 37,"Record function id": 0, "Ev Idx": 151
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::ones_like", "pid": 2337800, "tid": 2337800,
"ts": 6071193020559.426, "dur": 46.070,
"args": {
"External id": 38,"Record function id": 0, "Ev Idx": 152
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 2337800, "tid": 2337800,
"ts": 6071193020562.822, "dur": 16.885,
"args": {
"External id": 39,"Record function id": 0, "Ev Idx": 153
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 2337800, "tid": 2337800,
"ts": 6071193020569.862, "dur": 9.024,
"args": {
"External id": 40,"Record function id": 0, "Ev Idx": 154
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 2337800, "tid": 2337800,
"ts": 6071193020584.935, "dur": 20.311,
"args": {
"External id": 41,"Record function id": 0, "Ev Idx": 155
}
},
{
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 2337800, "tid": 2337800,
"ts": 6071193101675.772, "dur": 4674.498,
"args": {
"External id": 42,"Record function id": 0, "Ev Idx": 156
}
},
{
"ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 2337800, "tid": 2337800,
"ts": 6071193101688.501, "dur": 4658.734,
"args": {
"External id": 43,"Record function id": 0, "Ev Idx": 157
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 2337800, "tid": 2337800,
"ts": 6071193101699.047, "dur": 1083.914,
"args": {
"External id": 44,"Record function id": 0, "Ev Idx": 158
}
},
{
"ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 2337800, "tid": 2337800,
"ts": 6071193106523.631, "dur": 87.332,
"args": {
"External id": 45,"Record function id": 0, "Ev Idx": 159
}
},
{
"ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 0/0", "pid": 2337800, "tid": 2337800,
"ts": 6071193106612.255, "dur": 88007.936,
"args": {
"External id": 46,"Record function id": 0, "Ev Idx": 160
}
},
{
"ph": "X", "cat": "cpu_op", "name": "Pregraph bytecode", "pid": 2337800, "tid": 2337800,
"ts": 6071193106634.038, "dur": 7.010,
"args": {
"External id": 47,"Record function id": 0, "Ev Idx": 161
}
},
{
"ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 2337800, "tid": 2337800,
"ts": 6071193106736.943, "dur": 994.800,
"args": {
"External id": 48,"Record function id": 0, "Sequence number": 135, "Fwd thread id": 0, "Ev Idx": 162
}
},
{
"ph": "s", "id": 2, "pid": 2337800, "tid": 2337800, "ts": 6071193106736.943,
"cat": "fwdbwd", "name": "fwdbwd"
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused_nll_loss_backward_nll_loss_forward_0", "pid": 2337800, "tid": 2337800,
"ts": 6071193106896.774, "dur": 86.300,
"args": {
"External id": 49,"Record function id": 0, "Ev Idx": 163
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_per_fused_nll_loss_forward_1", "pid": 2337800, "tid": 2337800,
"ts": 6071193107000.660, "dur": 16.385,
"args": {
"External id": 50,"Record function id": 0, "Ev Idx": 164
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_mul_2", "pid": 2337800, "tid": 2337800,
"ts": 6071193107033.630, "dur": 17.196,
"args": {
"External id": 51,"Record function id": 0, "Ev Idx": 165
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 2337800, "tid": 2337800,
"ts": 6071193107083.335, "dur": 39.510,
"args": {
"External id": 52,"Record function id": 0, "Ev Idx": 166
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 2337800, "tid": 2337800,
"ts": 6071193107090.746, "dur": 30.616,
"args": {
"External id": 53,"Record function id": 0, "Ev Idx": 167
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 2337800, "tid": 2337800,
"ts": 6071193107097.567, "dur": 19.469,
"args": {
"External id": 54,"Record function id": 0, "Ev Idx": 168
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 2337800, "tid": 2337800,
"ts": 6071193107105.478, "dur": 11.087,
"args": {
"External id": 55,"Record function id": 0, "Ev Idx": 169
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2337800, "tid": 2337800,
"ts": 6071193107110.095, "dur": 4.728,
"args": {
"External id": 56,"Record function id": 0, "Ev Idx": 170
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 2337800, "tid": 2337800,
"ts": 6071193107135.013, "dur": 4.347,
"args": {
"External id": 57,"Record function id": 0, "Ev Idx": 171
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 2337800, "tid": 2337800,
"ts": 6071193107135.544, "dur": 3.485,
"args": {
"External id": 58,"Record function id": 0, "Ev Idx": 172
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 2337800, "tid": 2337800,
"ts": 6071193107136.105, "dur": 2.133,
"args": {
"External id": 59,"Record function id": 0, "Ev Idx": 173
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 2337800, "tid": 2337800,
"ts": 6071193107136.605, "dur": 1.413,
"args": {
"External id": 60,"Record function id": 0, "Ev Idx": 174
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2337800, "tid": 2337800,
"ts": 6071193107137.096, "dur": 0.631,
"args": {
"External id": 61,"Record function id": 0, "Ev Idx": 175
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 2337800, "tid": 2337800,
"ts": 6071193107143.296, "dur": 4.096,
"args": {
"External id": 62,"Record function id": 0, "Ev Idx": 176
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 2337800, "tid": 2337800,
"ts": 6071193107143.776, "dur": 3.315,
"args": {
"External id": 63,"Record function id": 0, "Ev Idx": 177
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 2337800, "tid": 2337800,
"ts": 6071193107144.267, "dur": 1.923,
"args": {
"External id": 64,"Record function id": 0, "Ev Idx": 178
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 2337800, "tid": 2337800,
"ts": 6071193107144.758, "dur": 1.242,
"args": {
"External id": 65,"Record function id": 0, "Ev Idx": 179
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2337800, "tid": 2337800,
"ts": 6071193107145.248, "dur": 0.511,
"args": {
"External id": 66,"Record function id": 0, "Ev Idx": 180
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 2337800, "tid": 2337800,
"ts": 6071193107151.207, "dur": 3.996,
"args": {
"External id": 67,"Record function id": 0, "Ev Idx": 181
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 2337800, "tid": 2337800,
"ts": 6071193107151.718, "dur": 3.195,
"args": {
"External id": 68,"Record function id": 0, "Ev Idx": 182
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 2337800, "tid": 2337800,
"ts": 6071193107152.199, "dur": 1.753,
"args": {
"External id": 69,"Record function id": 0, "Ev Idx": 183
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 2337800, "tid": 2337800,
"ts": 6071193107152.680, "dur": 1.081,
"args": {
"External id": 70,"Record function id": 0, "Ev Idx": 184
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2337800, "tid": 2337800,
"ts": 6071193107153.050, "dur": 0.471,
"args": {
"External id": 71,"Record function id": 0, "Ev Idx": 185
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_3", "pid": 2337800, "tid": 2337800,
"ts": 6071193107171.638, "dur": 14.742,
"args": {
"External id": 72,"Record function id": 0, "Ev Idx": 186
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_4", "pid": 2337800, "tid": 2337800,
"ts": 6071193107200.472, "dur": 12.889,
"args": {
"External id": 73,"Record function id": 0, "Ev Idx": 187
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_5", "pid": 2337800, "tid": 2337800,
"ts": 6071193107224.388, "dur": 13.981,
"args": {
"External id": 74,"Record function id": 0, "Ev Idx": 188
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 2337800, "tid": 2337800,
"ts": 6071193107268.985, "dur": 114.993,
"args": {
"External id": 75,"Record function id": 0, "Ev Idx": 189
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused__log_softmax__log_softmax_backward_data_addmm_nll_loss_backward_nll_loss_forward_6", "pid": 2337800, "tid": 2337800,
"ts": 6071193107408.085, "dur": 28.923,
"args": {
"External id": 76,"Record function id": 0, "Ev Idx": 190
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 2337800, "tid": 2337800,
"ts": 6071193107445.591, "dur": 23.846,
"args": {
"External id": 77,"Record function id": 0, "Ev Idx": 191
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused_nll_loss_forward_sum_7", "pid": 2337800, "tid": 2337800,
"ts": 6071193107484.810, "dur": 17.086,
"args": {
"External id": 78,"Record function id": 0, "Ev Idx": 192
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::addmm", "pid": 2337800, "tid": 2337800,
"ts": 6071193107515.096, "dur": 74.292,
"args": {
"External id": 79,"Record function id": 0, "Ev Idx": 193
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused_nll_loss_forward_8", "pid": 2337800, "tid": 2337800,
"ts": 6071193107615.187, "dur": 16.355,
"args": {
"External id": 80,"Record function id": 0, "Ev Idx": 194
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_per_fused_nll_loss_forward_9", "pid": 2337800, "tid": 2337800,
"ts": 6071193107645.212, "dur": 15.103,
"args": {
"External id": 81,"Record function id": 0, "Ev Idx": 195
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::ones_like", "pid": 2337800, "tid": 2337800,
"ts": 6071193107797.772, "dur": 41.903,
"args": {
"External id": 82,"Record function id": 0, "Ev Idx": 196
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 2337800, "tid": 2337800,
"ts": 6071193107800.917, "dur": 13.520,
"args": {
"External id": 83,"Record function id": 0, "Ev Idx": 197
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 2337800, "tid": 2337800,
"ts": 6071193107805.103, "dur": 8.533,
"args": {
"External id": 84,"Record function id": 0, "Ev Idx": 198
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 2337800, "tid": 2337800,
"ts": 6071193107816.611, "dur": 22.794,
"args": {
"External id": 85,"Record function id": 0, "Ev Idx": 199
}
},
{
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 2337800, "tid": 2337800,
"ts": 6071193190696.625, "dur": 3846.109,
"args": {
"External id": 86,"Record function id": 0, "Ev Idx": 200
}
},
{
"ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 2337800, "tid": 2337800,
"ts": 6071193190711.307, "dur": 3825.438,
"args": {
"External id": 87,"Record function id": 0, "Ev Idx": 201
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 2337800, "tid": 2337800,
"ts": 6071193190722.264, "dur": 998.035,
"args": {
"External id": 88,"Record function id": 0, "Ev Idx": 202
}
},
{
"ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 2337800, "tid": 2337800,
"ts": 6071193194731.008, "dur": 88.954,
"args": {
"External id": 89,"Record function id": 0, "Ev Idx": 203
}
},
{
"ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 0/0", "pid": 2337800, "tid": 2337800,
"ts": 6071193194821.134, "dur": 87447.752,
"args": {
"External id": 90,"Record function id": 0, "Ev Idx": 204
}
},
{
"ph": "X", "cat": "cpu_op", "name": "Pregraph bytecode", "pid": 2337800, "tid": 2337800,
"ts": 6071193194842.516, "dur": 5.579,
"args": {
"External id": 91,"Record function id": 0, "Ev Idx": 205
}
},
{
"ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 2337800, "tid": 2337800,
"ts": 6071193194945.892, "dur": 1057.334,
"args": {
"External id": 92,"Record function id": 0, "Sequence number": 136, "Fwd thread id": 0, "Ev Idx": 206
}
},
{
"ph": "s", "id": 3, "pid": 2337800, "tid": 2337800, "ts": 6071193194945.892,
"cat": "fwdbwd", "name": "fwdbwd"
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused_nll_loss_backward_nll_loss_forward_0", "pid": 2337800, "tid": 2337800,
"ts": 6071193195113.134, "dur": 93.872,
"args": {
"External id": 93,"Record function id": 0, "Ev Idx": 207
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_per_fused_nll_loss_forward_1", "pid": 2337800, "tid": 2337800,
"ts": 6071193195226.846, "dur": 16.655,
"args": {
"External id": 94,"Record function id": 0, "Ev Idx": 208
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_mul_2", "pid": 2337800, "tid": 2337800,
"ts": 6071193195262.990, "dur": 17.627,
"args": {
"External id": 95,"Record function id": 0, "Ev Idx": 209
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 2337800, "tid": 2337800,
"ts": 6071193195339.005, "dur": 46.880,
"args": {
"External id": 96,"Record function id": 0, "Ev Idx": 210
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 2337800, "tid": 2337800,
"ts": 6071193195347.387, "dur": 36.185,
"args": {
"External id": 97,"Record function id": 0, "Ev Idx": 211
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 2337800, "tid": 2337800,
"ts": 6071193195356.261, "dur": 22.564,
"args": {
"External id": 98,"Record function id": 0, "Ev Idx": 212
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 2337800, "tid": 2337800,
"ts": 6071193195365.445, "dur": 12.919,
"args": {
"External id": 99,"Record function id": 0, "Ev Idx": 213
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2337800, "tid": 2337800,
"ts": 6071193195370.562, "dur": 5.188,
"args": {
"External id": 100,"Record function id": 0, "Ev Idx": 214
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 2337800, "tid": 2337800,
"ts": 6071193195398.605, "dur": 5.117,
"args": {
"External id": 101,"Record function id": 0, "Ev Idx": 215
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 2337800, "tid": 2337800,
"ts": 6071193195399.296, "dur": 4.096,
"args": {
"External id": 102,"Record function id": 0, "Ev Idx": 216
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 2337800, "tid": 2337800,
"ts": 6071193195400.097, "dur": 2.384,
"args": {
"External id": 103,"Record function id": 0, "Ev Idx": 217
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 2337800, "tid": 2337800,
"ts": 6071193195400.798, "dur": 1.442,
"args": {
"External id": 104,"Record function id": 0, "Ev Idx": 218
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2337800, "tid": 2337800,
"ts": 6071193195401.449, "dur": 0.511,
"args": {
"External id": 105,"Record function id": 0, "Ev Idx": 219
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 2337800, "tid": 2337800,
"ts": 6071193195408.179, "dur": 4.287,
"args": {
"External id": 106,"Record function id": 0, "Ev Idx": 220
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 2337800, "tid": 2337800,
"ts": 6071193195408.620, "dur": 3.525,
"args": {
"External id": 107,"Record function id": 0, "Ev Idx": 221
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 2337800, "tid": 2337800,
"ts": 6071193195409.161, "dur": 2.033,
"args": {
"External id": 108,"Record function id": 0, "Ev Idx": 222
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 2337800, "tid": 2337800,
"ts": 6071193195409.761, "dur": 1.232,
"args": {
"External id": 109,"Record function id": 0, "Ev Idx": 223
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2337800, "tid": 2337800,
"ts": 6071193195410.402, "dur": 0.351,
"args": {
"External id": 110,"Record function id": 0, "Ev Idx": 224
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 2337800, "tid": 2337800,
"ts": 6071193195416.281, "dur": 4.397,
"args": {
"External id": 111,"Record function id": 0, "Ev Idx": 225
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 2337800, "tid": 2337800,
"ts": 6071193195416.692, "dur": 3.696,
"args": {
"External id": 112,"Record function id": 0, "Ev Idx": 226
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 2337800, "tid": 2337800,
"ts": 6071193195417.283, "dur": 2.313,
"args": {
"External id": 113,"Record function id": 0, "Ev Idx": 227
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 2337800, "tid": 2337800,
"ts": 6071193195417.844, "dur": 1.552,
"args": {
"External id": 114,"Record function id": 0, "Ev Idx": 228
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2337800, "tid": 2337800,
"ts": 6071193195418.284, "dur": 0.872,
"args": {
"External id": 115,"Record function id": 0, "Ev Idx": 229
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_3", "pid": 2337800, "tid": 2337800,
"ts": 6071193195437.303, "dur": 16.134,
"args": {
"External id": 116,"Record function id": 0, "Ev Idx": 230
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_4", "pid": 2337800, "tid": 2337800,
"ts": 6071193195467.849, "dur": 13.130,
"args": {
"External id": 117,"Record function id": 0, "Ev Idx": 231
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_5", "pid": 2337800, "tid": 2337800,
"ts": 6071193195493.978, "dur": 12.169,
"args": {
"External id": 118,"Record function id": 0, "Ev Idx": 232
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 2337800, "tid": 2337800,
"ts": 6071193195540.589, "dur": 102.584,
"args": {
"External id": 119,"Record function id": 0, "Ev Idx": 233
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused__log_softmax__log_softmax_backward_data_addmm_nll_loss_backward_nll_loss_forward_6", "pid": 2337800, "tid": 2337800,
"ts": 6071193195667.590, "dur": 25.889,
"args": {
"External id": 120,"Record function id": 0, "Ev Idx": 234
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 2337800, "tid": 2337800,
"ts": 6071193195703.364, "dur": 24.467,
"args": {
"External id": 121,"Record function id": 0, "Ev Idx": 235
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused_nll_loss_forward_sum_7", "pid": 2337800, "tid": 2337800,
"ts": 6071193195743.795, "dur": 17.096,
"args": {
"External id": 122,"Record function id": 0, "Ev Idx": 236
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::addmm", "pid": 2337800, "tid": 2337800,
"ts": 6071193195773.600, "dur": 82.114,
"args": {
"External id": 123,"Record function id": 0, "Ev Idx": 237
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused_nll_loss_forward_8", "pid": 2337800, "tid": 2337800,
"ts": 6071193195883.726, "dur": 15.744,
"args": {
"External id": 124,"Record function id": 0, "Ev Idx": 238
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_per_fused_nll_loss_forward_9", "pid": 2337800, "tid": 2337800,
"ts": 6071193195913.241, "dur": 13.730,
"args": {
"External id": 125,"Record function id": 0, "Ev Idx": 239
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::ones_like", "pid": 2337800, "tid": 2337800,
"ts": 6071193196076.567, "dur": 44.106,
"args": {
"External id": 126,"Record function id": 0, "Ev Idx": 240
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 2337800, "tid": 2337800,
"ts": 6071193196080.272, "dur": 14.172,
"args": {
"External id": 127,"Record function id": 0, "Ev Idx": 241
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 2337800, "tid": 2337800,
"ts": 6071193196084.899, "dur": 8.894,
"args": {
"External id": 128,"Record function id": 0, "Ev Idx": 242
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 2337800, "tid": 2337800,
"ts": 6071193196097.759, "dur": 22.634,
"args": {
"External id": 129,"Record function id": 0, "Ev Idx": 243
}
},
{
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 2337800, "tid": 2337800,
"ts": 6071193278361.244, "dur": 3831.447,
"args": {
"External id": 130,"Record function id": 0, "Ev Idx": 244
}
},
{
"ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 2337800, "tid": 2337800,
"ts": 6071193278375.135, "dur": 3814.511,
"args": {
"External id": 131,"Record function id": 0, "Ev Idx": 245
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 2337800, "tid": 2337800,
"ts": 6071193278386.191, "dur": 849.301,
"args": {
"External id": 132,"Record function id": 0, "Ev Idx": 246
}
},
{
"ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 2337800, "tid": 2337800,
"ts": 6071193282406.223, "dur": 89.324,
"args": {
"External id": 133,"Record function id": 0, "Ev Idx": 247
}
},
{
"ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 0/0", "pid": 2337800, "tid": 2337800,
"ts": 6071193282496.819, "dur": 86416.978,
"args": {
"External id": 134,"Record function id": 0, "Ev Idx": 248
}
},
{
"ph": "X", "cat": "cpu_op", "name": "Pregraph bytecode", "pid": 2337800, "tid": 2337800,
"ts": 6071193282519.714, "dur": 6.109,
"args": {
"External id": 135,"Record function id": 0, "Ev Idx": 249
}
},
{
"ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 2337800, "tid": 2337800,
"ts": 6071193282626.485, "dur": 1073.789,
"args": {
"External id": 136,"Record function id": 0, "Sequence number": 137, "Fwd thread id": 0, "Ev Idx": 250
}
},
{
"ph": "s", "id": 4, "pid": 2337800, "tid": 2337800, "ts": 6071193282626.485,
"cat": "fwdbwd", "name": "fwdbwd"
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused_nll_loss_backward_nll_loss_forward_0", "pid": 2337800, "tid": 2337800,
"ts": 6071193282794.819, "dur": 92.048,
"args": {
"External id": 137,"Record function id": 0, "Ev Idx": 251
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_per_fused_nll_loss_forward_1", "pid": 2337800, "tid": 2337800,
"ts": 6071193282905.906, "dur": 15.884,
"args": {
"External id": 138,"Record function id": 0, "Ev Idx": 252
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_mul_2", "pid": 2337800, "tid": 2337800,
"ts": 6071193282939.647, "dur": 16.495,
"args": {
"External id": 139,"Record function id": 0, "Ev Idx": 253
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 2337800, "tid": 2337800,
"ts": 6071193282992.006, "dur": 47.051,
"args": {
"External id": 140,"Record function id": 0, "Ev Idx": 254
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 2337800, "tid": 2337800,
"ts": 6071193283000.048, "dur": 36.124,
"args": {
"External id": 141,"Record function id": 0, "Ev Idx": 255
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 2337800, "tid": 2337800,
"ts": 6071193283008.961, "dur": 22.454,
"args": {
"External id": 142,"Record function id": 0, "Ev Idx": 256
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 2337800, "tid": 2337800,
"ts": 6071193283017.975, "dur": 12.949,
"args": {
"External id": 143,"Record function id": 0, "Ev Idx": 257
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2337800, "tid": 2337800,
"ts": 6071193283022.993, "dur": 5.558,
"args": {
"External id": 144,"Record function id": 0, "Ev Idx": 258
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 2337800, "tid": 2337800,
"ts": 6071193283050.975, "dur": 4.827,
"args": {
"External id": 145,"Record function id": 0, "Ev Idx": 259
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 2337800, "tid": 2337800,
"ts": 6071193283051.536, "dur": 3.936,
"args": {
"External id": 146,"Record function id": 0, "Ev Idx": 260
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 2337800, "tid": 2337800,
"ts": 6071193283052.297, "dur": 2.453,
"args": {
"External id": 147,"Record function id": 0, "Ev Idx": 261
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 2337800, "tid": 2337800,
"ts": 6071193283052.938, "dur": 1.582,
"args": {
"External id": 148,"Record function id": 0, "Ev Idx": 262
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2337800, "tid": 2337800,
"ts": 6071193283053.448, "dur": 0.782,
"args": {
"External id": 149,"Record function id": 0, "Ev Idx": 263
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 2337800, "tid": 2337800,
"ts": 6071193283059.738, "dur": 4.116,
"args": {
"External id": 150,"Record function id": 0, "Ev Idx": 264
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 2337800, "tid": 2337800,
"ts": 6071193283060.149, "dur": 3.395,
"args": {
"External id": 151,"Record function id": 0, "Ev Idx": 265
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 2337800, "tid": 2337800,
"ts": 6071193283060.659, "dur": 2.104,
"args": {
"External id": 152,"Record function id": 0, "Ev Idx": 266
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 2337800, "tid": 2337800,
"ts": 6071193283061.160, "dur": 1.402,
"args": {
"External id": 153,"Record function id": 0, "Ev Idx": 267
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2337800, "tid": 2337800,
"ts": 6071193283061.781, "dur": 0.541,
"args": {
"External id": 154,"Record function id": 0, "Ev Idx": 268
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 2337800, "tid": 2337800,
"ts": 6071193283067.550, "dur": 3.815,
"args": {
"External id": 155,"Record function id": 0, "Ev Idx": 269
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 2337800, "tid": 2337800,
"ts": 6071193283067.970, "dur": 3.105,
"args": {
"External id": 156,"Record function id": 0, "Ev Idx": 270
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 2337800, "tid": 2337800,
"ts": 6071193283068.441, "dur": 1.843,
"args": {
"External id": 157,"Record function id": 0, "Ev Idx": 271
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 2337800, "tid": 2337800,
"ts": 6071193283068.932, "dur": 1.162,
"args": {
"External id": 158,"Record function id": 0, "Ev Idx": 272
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2337800, "tid": 2337800,
"ts": 6071193283069.302, "dur": 0.561,
"args": {
"External id": 159,"Record function id": 0, "Ev Idx": 273
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_3", "pid": 2337800, "tid": 2337800,
"ts": 6071193283088.071, "dur": 16.565,
"args": {
"External id": 160,"Record function id": 0, "Ev Idx": 274
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_4", "pid": 2337800, "tid": 2337800,
"ts": 6071193283119.047, "dur": 12.960,
"args": {
"External id": 161,"Record function id": 0, "Ev Idx": 275
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_5", "pid": 2337800, "tid": 2337800,
"ts": 6071193283143.945, "dur": 14.241,
"args": {
"External id": 162,"Record function id": 0, "Ev Idx": 276
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 2337800, "tid": 2337800,
"ts": 6071193283195.402, "dur": 131.118,
"args": {
"External id": 163,"Record function id": 0, "Ev Idx": 277
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused__log_softmax__log_softmax_backward_data_addmm_nll_loss_backward_nll_loss_forward_6", "pid": 2337800, "tid": 2337800,
"ts": 6071193283353.491, "dur": 28.052,
"args": {
"External id": 164,"Record function id": 0, "Ev Idx": 278
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 2337800, "tid": 2337800,
"ts": 6071193283391.318, "dur": 26.530,
"args": {
"External id": 165,"Record function id": 0, "Ev Idx": 279
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused_nll_loss_forward_sum_7", "pid": 2337800, "tid": 2337800,
"ts": 6071193283434.112, "dur": 18.008,
"args": {
"External id": 166,"Record function id": 0, "Ev Idx": 280
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::addmm", "pid": 2337800, "tid": 2337800,
"ts": 6071193283465.309, "dur": 81.624,
"args": {
"External id": 167,"Record function id": 0, "Ev Idx": 281
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused_nll_loss_forward_8", "pid": 2337800, "tid": 2337800,
"ts": 6071193283574.985, "dur": 16.024,
"args": {
"External id": 168,"Record function id": 0, "Ev Idx": 282
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_per_fused_nll_loss_forward_9", "pid": 2337800, "tid": 2337800,
"ts": 6071193283605.992, "dur": 14.702,
"args": {
"External id": 169,"Record function id": 0, "Ev Idx": 283
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::ones_like", "pid": 2337800, "tid": 2337800,
"ts": 6071193283784.210, "dur": 44.007,
"args": {
"External id": 170,"Record function id": 0, "Ev Idx": 284
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 2337800, "tid": 2337800,
"ts": 6071193283787.545, "dur": 13.841,
"args": {
"External id": 171,"Record function id": 0, "Ev Idx": 285
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 2337800, "tid": 2337800,
"ts": 6071193283791.912, "dur": 8.863,
"args": {
"External id": 172,"Record function id": 0, "Ev Idx": 286
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 2337800, "tid": 2337800,
"ts": 6071193283804.441, "dur": 23.515,
"args": {
"External id": 173,"Record function id": 0, "Ev Idx": 287
}
},
{
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 2337800, "tid": 2337800,
"ts": 6071193364119.438, "dur": 4724.754,
"args": {
"External id": 174,"Record function id": 0, "Ev Idx": 288
}
},
{
"ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 2337800, "tid": 2337800,
"ts": 6071193364132.668, "dur": 4708.489,
"args": {
"External id": 175,"Record function id": 0, "Ev Idx": 289
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 2337800, "tid": 2337800,
"ts": 6071193364142.402, "dur": 1031.005,
"args": {
"External id": 176,"Record function id": 0, "Ev Idx": 290
}
},
{
"ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 2337800, "tid": 2337800,
"ts": 6071193369016.151, "dur": 84.287,
"args": {
"External id": 177,"Record function id": 0, "Ev Idx": 291
}
},
{
"ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 0/0", "pid": 2337800, "tid": 2337800,
"ts": 6071193369101.740, "dur": 87260.930,
"args": {
"External id": 178,"Record function id": 0, "Ev Idx": 292
}
},
{
"ph": "X", "cat": "cpu_op", "name": "Pregraph bytecode", "pid": 2337800, "tid": 2337800,
"ts": 6071193369122.772, "dur": 5.338,
"args": {
"External id": 179,"Record function id": 0, "Ev Idx": 293
}
},
{
"ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 2337800, "tid": 2337800,
"ts": 6071193369223.333, "dur": 1025.927,
"args": {
"External id": 180,"Record function id": 0, "Sequence number": 138, "Fwd thread id": 0, "Ev Idx": 294
}
},
{
"ph": "s", "id": 5, "pid": 2337800, "tid": 2337800, "ts": 6071193369223.333,
"cat": "fwdbwd", "name": "fwdbwd"
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused_nll_loss_backward_nll_loss_forward_0", "pid": 2337800, "tid": 2337800,
"ts": 6071193369406.630, "dur": 87.572,
"args": {
"External id": 181,"Record function id": 0, "Ev Idx": 295
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_per_fused_nll_loss_forward_1", "pid": 2337800, "tid": 2337800,
"ts": 6071193369512.349, "dur": 16.465,
"args": {
"External id": 182,"Record function id": 0, "Ev Idx": 296
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_mul_2", "pid": 2337800, "tid": 2337800,
"ts": 6071193369546.541, "dur": 17.987,
"args": {
"External id": 183,"Record function id": 0, "Ev Idx": 297
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 2337800, "tid": 2337800,
"ts": 6071193369596.626, "dur": 44.848,
"args": {
"External id": 184,"Record function id": 0, "Ev Idx": 298
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 2337800, "tid": 2337800,
"ts": 6071193369604.127, "dur": 35.474,
"args": {
"External id": 185,"Record function id": 0, "Ev Idx": 299
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 2337800, "tid": 2337800,
"ts": 6071193369612.410, "dur": 22.534,
"args": {
"External id": 186,"Record function id": 0, "Ev Idx": 300
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 2337800, "tid": 2337800,
"ts": 6071193369621.023, "dur": 13.450,
"args": {
"External id": 187,"Record function id": 0, "Ev Idx": 301
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2337800, "tid": 2337800,
"ts": 6071193369626.742, "dur": 5.628,
"args": {
"External id": 188,"Record function id": 0, "Ev Idx": 302
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 2337800, "tid": 2337800,
"ts": 6071193369654.604, "dur": 4.887,
"args": {
"External id": 189,"Record function id": 0, "Ev Idx": 303
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 2337800, "tid": 2337800,
"ts": 6071193369655.104, "dur": 3.646,
"args": {
"External id": 190,"Record function id": 0, "Ev Idx": 304
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 2337800, "tid": 2337800,
"ts": 6071193369655.635, "dur": 2.314,
"args": {
"External id": 191,"Record function id": 0, "Ev Idx": 305
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 2337800, "tid": 2337800,
"ts": 6071193369656.216, "dur": 1.522,
"args": {
"External id": 192,"Record function id": 0, "Ev Idx": 306
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2337800, "tid": 2337800,
"ts": 6071193369657.017, "dur": 0.431,
"args": {
"External id": 193,"Record function id": 0, "Ev Idx": 307
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 2337800, "tid": 2337800,
"ts": 6071193369663.537, "dur": 4.417,
"args": {
"External id": 194,"Record function id": 0, "Ev Idx": 308
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 2337800, "tid": 2337800,
"ts": 6071193369664.208, "dur": 3.445,
"args": {
"External id": 195,"Record function id": 0, "Ev Idx": 309
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 2337800, "tid": 2337800,
"ts": 6071193369664.719, "dur": 1.983,
"args": {
"External id": 196,"Record function id": 0, "Ev Idx": 310
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 2337800, "tid": 2337800,
"ts": 6071193369665.220, "dur": 1.291,
"args": {
"External id": 197,"Record function id": 0, "Ev Idx": 311
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2337800, "tid": 2337800,
"ts": 6071193369665.640, "dur": 0.631,
"args": {
"External id": 198,"Record function id": 0, "Ev Idx": 312
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 2337800, "tid": 2337800,
"ts": 6071193369671.669, "dur": 4.096,
"args": {
"External id": 199,"Record function id": 0, "Ev Idx": 313
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 2337800, "tid": 2337800,
"ts": 6071193369672.100, "dur": 3.345,
"args": {
"External id": 200,"Record function id": 0, "Ev Idx": 314
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 2337800, "tid": 2337800,
"ts": 6071193369672.561, "dur": 1.782,
"args": {
"External id": 201,"Record function id": 0, "Ev Idx": 315
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 2337800, "tid": 2337800,
"ts": 6071193369673.051, "dur": 1.122,
"args": {
"External id": 202,"Record function id": 0, "Ev Idx": 316
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2337800, "tid": 2337800,
"ts": 6071193369673.522, "dur": 0.421,
"args": {
"External id": 203,"Record function id": 0, "Ev Idx": 317
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_3", "pid": 2337800, "tid": 2337800,
"ts": 6071193369693.843, "dur": 16.414,
"args": {
"External id": 204,"Record function id": 0, "Ev Idx": 318
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_4", "pid": 2337800, "tid": 2337800,
"ts": 6071193369725.390, "dur": 12.639,
"args": {
"External id": 205,"Record function id": 0, "Ev Idx": 319
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_5", "pid": 2337800, "tid": 2337800,
"ts": 6071193369751.239, "dur": 13.861,
"args": {
"External id": 206,"Record function id": 0, "Ev Idx": 320
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 2337800, "tid": 2337800,
"ts": 6071193369797.799, "dur": 96.836,
"args": {
"External id": 207,"Record function id": 0, "Ev Idx": 321
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused__log_softmax__log_softmax_backward_data_addmm_nll_loss_backward_nll_loss_forward_6", "pid": 2337800, "tid": 2337800,
"ts": 6071193369917.310, "dur": 26.610,
"args": {
"External id": 208,"Record function id": 0, "Ev Idx": 322
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 2337800, "tid": 2337800,
"ts": 6071193369953.224, "dur": 24.467,
"args": {
"External id": 209,"Record function id": 0, "Ev Idx": 323
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused_nll_loss_forward_sum_7", "pid": 2337800, "tid": 2337800,
"ts": 6071193369994.456, "dur": 18.057,
"args": {
"External id": 210,"Record function id": 0, "Ev Idx": 324
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::addmm", "pid": 2337800, "tid": 2337800,
"ts": 6071193370024.972, "dur": 76.906,
"args": {
"External id": 211,"Record function id": 0, "Ev Idx": 325
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused_nll_loss_forward_8", "pid": 2337800, "tid": 2337800,
"ts": 6071193370129.029, "dur": 16.465,
"args": {
"External id": 212,"Record function id": 0, "Ev Idx": 326
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_per_fused_nll_loss_forward_9", "pid": 2337800, "tid": 2337800,
"ts": 6071193370159.345, "dur": 16.695,
"args": {
"External id": 213,"Record function id": 0, "Ev Idx": 327
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::ones_like", "pid": 2337800, "tid": 2337800,
"ts": 6071193370344.153, "dur": 44.818,
"args": {
"External id": 214,"Record function id": 0, "Ev Idx": 328
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 2337800, "tid": 2337800,
"ts": 6071193370347.528, "dur": 15.173,
"args": {
"External id": 215,"Record function id": 0, "Ev Idx": 329
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 2337800, "tid": 2337800,
"ts": 6071193370351.915, "dur": 9.985,
"args": {
"External id": 216,"Record function id": 0, "Ev Idx": 330
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 2337800, "tid": 2337800,
"ts": 6071193370365.405, "dur": 23.295,
"args": {
"External id": 217,"Record function id": 0, "Ev Idx": 331
}
},
{
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 2337800, "tid": 2337800,
"ts": 6071193452316.139, "dur": 3949.765,
"args": {
"External id": 218,"Record function id": 0, "Ev Idx": 332
}
},
{
"ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 2337800, "tid": 2337800,
"ts": 6071193452328.046, "dur": 3933.021,
"args": {
"External id": 219,"Record function id": 0, "Ev Idx": 333
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 2337800, "tid": 2337800,
"ts": 6071193452338.472, "dur": 862.471,
"args": {
"External id": 220,"Record function id": 0, "Ev Idx": 334
}
},
{
"ph": "X", "cat": "overhead", "name": "Unrecognized", "pid": -1, "tid": 0,
"ts": 6071193017878.845, "dur": 1677.870
},
{
"ph": "X", "cat": "kernel", "name": "triton_red_fused_nll_loss_backward_nll_loss_forward_0", "pid": 0, "tid": 7,
"ts": 6071193019717.866, "dur": 7.104,
"args": {
"External id": 5, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 30, "registers per thread": 32, "shared memory": 16384, "blocks per SM": 0.030303, "warps per SM": 0.484848, "grid": [4, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 1
}
},
{
"ph": "f", "id": 30, "pid": 0, "tid": 7, "ts": 6071193019717.866,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800,
"ts": 6071193019690.266, "dur": 30.726,
"args": {
"External id": 5, "cbid": 307, "correlation": 30
}
},
{
"ph": "s", "id": 30, "pid": 2337800, "tid": 2337800, "ts": 6071193019690.266,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_per_fused_nll_loss_forward_1", "pid": 0, "tid": 7,
"ts": 6071193019755.913, "dur": 1.632,
"args": {
"External id": 6, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 38, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.015152, "grid": [1, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0
}
},
{
"ph": "f", "id": 38, "pid": 0, "tid": 7, "ts": 6071193019755.913,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800,
"ts": 6071193019748.263, "dur": 6.290,
"args": {
"External id": 6, "cbid": 307, "correlation": 38
}
},
{
"ph": "s", "id": 38, "pid": 2337800, "tid": 2337800, "ts": 6071193019748.263,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_mul_2", "pid": 0, "tid": 7,
"ts": 6071193019791.785, "dur": 43.776,
"args": {
"External id": 7, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 46, "registers per thread": 16, "shared memory": 0, "blocks per SM": 186.181824, "warps per SM": 744.727295, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100
}
},
{
"ph": "f", "id": 46, "pid": 0, "tid": 7, "ts": 6071193019791.785,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800,
"ts": 6071193019783.827, "dur": 6.390,
"args": {
"External id": 7, "cbid": 307, "correlation": 46
}
},
{
"ph": "s", "id": 46, "pid": 2337800, "tid": 2337800, "ts": 6071193019783.827,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_3", "pid": 0, "tid": 7,
"ts": 6071193019943.785, "dur": 1.344,
"args": {
"External id": 28, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 53, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.007576, "grid": [1, 1, 1], "block": [32, 1, 1], "est. achieved occupancy %": 0
}
},
{
"ph": "f", "id": 53, "pid": 0, "tid": 7, "ts": 6071193019943.785,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800,
"ts": 6071193019935.255, "dur": 7.251,
"args": {
"External id": 28, "cbid": 307, "correlation": 53
}
},
{
"ph": "s", "id": 53, "pid": 2337800, "tid": 2337800, "ts": 6071193019935.255,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_4", "pid": 0, "tid": 7,
"ts": 6071193019968.617, "dur": 32.065,
"args": {
"External id": 29, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 60, "registers per thread": 16, "shared memory": 0, "blocks per SM": 285.553040, "warps per SM": 1142.212158, "grid": [37693, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100
}
},
{
"ph": "f", "id": 60, "pid": 0, "tid": 7, "ts": 6071193019968.617,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800,
"ts": 6071193019961.074, "dur": 5.679,
"args": {
"External id": 29, "cbid": 307, "correlation": 60
}
},
{
"ph": "s", "id": 60, "pid": 2337800, "tid": 2337800, "ts": 6071193019961.074,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_5", "pid": 0, "tid": 7,
"ts": 6071193020001.609, "dur": 1.440,
"args": {
"External id": 30, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 67, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.750000, "warps per SM": 3.000000, "grid": [99, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 5
}
},
{
"ph": "f", "id": 67, "pid": 0, "tid": 7, "ts": 6071193020001.609,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800,
"ts": 6071193019987.184, "dur": 5.828,
"args": {
"External id": 30, "cbid": 307, "correlation": 67
}
},
{
"ph": "s", "id": 67, "pid": 2337800, "tid": 2337800, "ts": 6071193019987.184,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 2337800, "tid": 2337800,
"ts": 6071193020103.890, "dur": 1.692,
"args": {
"External id": 31, "cbid": 200, "correlation": 82
}
},
{
"ph": "f", "id": 82, "pid": 2337800, "tid": 2337800, "ts": 6071193020103.890,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2<cutlass_75_tensorop_bf16_s1688gemm_bf16_128x128_tn_align1>(cutlass_75_tensorop_bf16_s1688gemm_bf16_128x128_tn_align1::Params)", "pid": 0, "tid": 7,
"ts": 6071193020117.641, "dur": 26803.006,
"args": {
"External id": 31, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 84, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 775.757568, "warps per SM": 3103.030273, "grid": [2048, 50, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13
}
},
{
"ph": "f", "id": 84, "pid": 0, "tid": 7, "ts": 6071193020117.641,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800,
"ts": 6071193020109.328, "dur": 6.420,
"args": {
"External id": 31, "cbid": 307, "correlation": 84
}
},
{
"ph": "s", "id": 84, "pid": 2337800, "tid": 2337800, "ts": 6071193020109.328,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_red_fused__log_softmax__log_softmax_backward_data_addmm_nll_loss_backward_nll_loss_forward_6", "pid": 0, "tid": 7,
"ts": 6071193046921.607, "dur": 5544.031,
"args": {
"External id": 32, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 105, "registers per thread": 48, "shared memory": 32, "blocks per SM": 248.242432, "warps per SM": 1985.939453, "grid": [32768, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 63
}
},
{
"ph": "f", "id": 105, "pid": 0, "tid": 7, "ts": 6071193046921.607,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800,
"ts": 6071193020153.344, "dur": 7.191,
"args": {
"External id": 32, "cbid": 307, "correlation": 105
}
},
{
"ph": "s", "id": 105, "pid": 2337800, "tid": 2337800, "ts": 6071193020153.344,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 2337800, "tid": 2337800,
"ts": 6071193020181.937, "dur": 0.361,
"args": {
"External id": 33, "cbid": 200, "correlation": 118
}
},
{
"ph": "f", "id": 118, "pid": 2337800, "tid": 2337800, "ts": 6071193020181.937,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2<cutlass_75_tensorop_bf16_s1688gemm_bf16_256x128_nn_align1>(cutlass_75_tensorop_bf16_s1688gemm_bf16_256x128_nn_align1::Params)", "pid": 0, "tid": 7,
"ts": 6071193052466.502, "dur": 19462.558,
"args": {
"External id": 33, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 121, "registers per thread": 229, "shared memory": 49152, "blocks per SM": 7.757576, "warps per SM": 62.060608, "grid": [1024, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13
}
},
{
"ph": "f", "id": 121, "pid": 0, "tid": 7, "ts": 6071193052466.502,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800,
"ts": 6071193020195.858, "dur": 5.398,
"args": {
"External id": 33, "cbid": 307, "correlation": 121
}
},
{
"ph": "s", "id": 121, "pid": 2337800, "tid": 2337800, "ts": 6071193020195.858,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_red_fused_nll_loss_forward_sum_7", "pid": 0, "tid": 7,
"ts": 6071193071930.020, "dur": 1659.200,
"args": {
"External id": 34, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 132, "registers per thread": 40, "shared memory": 4096, "blocks per SM": 5.954545, "warps per SM": 95.272728, "grid": [786, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 75
}
},
{
"ph": "f", "id": 132, "pid": 0, "tid": 7, "ts": 6071193071930.020,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800,
"ts": 6071193020224.592, "dur": 6.079,
"args": {
"External id": 34, "cbid": 307, "correlation": 132
}
},
{
"ph": "s", "id": 132, "pid": 2337800, "tid": 2337800, "ts": 6071193020224.592,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7,
"ts": 6071193073590.116, "dur": 75.072,
"args": {
"External id": 35, "device": 0, "context": 1, "stream": 7, "correlation": 139, "bytes": 77194752, "memory bandwidth (GB/s)": 1028.2762148337597
}
},
{
"ph": "f", "id": 139, "pid": 0, "tid": 7, "ts": 6071193073590.116,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 2337800, "tid": 2337800,
"ts": 6071193020270.501, "dur": 21.893,
"args": {
"External id": 35, "cbid": 41, "correlation": 139
}
},
{
"ph": "s", "id": 139, "pid": 2337800, "tid": 2337800, "ts": 6071193020270.501,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 2337800, "tid": 2337800,
"ts": 6071193020321.137, "dur": 0.551,
"args": {
"External id": 35, "cbid": 200, "correlation": 150
}
},
{
"ph": "f", "id": 150, "pid": 2337800, "tid": 2337800, "ts": 6071193020321.137,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2<cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_256x128_32x6_nt_align8>(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_256x128_32x6_nt_align8::Params)", "pid": 0, "tid": 7,
"ts": 6071193073666.116, "dur": 5819.263,
"args": {
"External id": 35, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 153, "registers per thread": 216, "shared memory": 147456, "blocks per SM": 11.909091, "warps per SM": 95.272728, "grid": [1572, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0
}
},
{
"ph": "f", "id": 153, "pid": 0, "tid": 7, "ts": 6071193073666.116,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800,
"ts": 6071193020323.641, "dur": 4.707,
"args": {
"External id": 35, "cbid": 307, "correlation": 153
}
},
{
"ph": "s", "id": 153, "pid": 2337800, "tid": 2337800, "ts": 6071193020323.641,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_red_fused_nll_loss_forward_8", "pid": 0, "tid": 7,
"ts": 6071193079487.267, "dur": 2.720,
"args": {
"External id": 36, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 165, "registers per thread": 26, "shared memory": 64, "blocks per SM": 0.030303, "warps per SM": 0.484848, "grid": [4, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 1
}
},
{
"ph": "f", "id": 165, "pid": 0, "tid": 7, "ts": 6071193079487.267,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800,
"ts": 6071193020364.653, "dur": 5.608,
"args": {
"External id": 36, "cbid": 307, "correlation": 165
}
},
{
"ph": "s", "id": 165, "pid": 2337800, "tid": 2337800, "ts": 6071193020364.653,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_per_fused_nll_loss_forward_9", "pid": 0, "tid": 7,
"ts": 6071193079490.915, "dur": 1.728,
"args": {
"External id": 37, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 170, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.015152, "grid": [1, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0
}
},
{
"ph": "f", "id": 170, "pid": 0, "tid": 7, "ts": 6071193079490.915,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800,
"ts": 6071193020391.443, "dur": 5.178,
"args": {
"External id": 37, "cbid": 307, "correlation": 170
}
},
{
"ph": "s", "id": 170, "pid": 2337800, "tid": 2337800, "ts": 6071193020391.443,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<8, at::native::FillFunctor<c10::BFloat16>, std::array<char*, 1ul> >(int, at::native::FillFunctor<c10::BFloat16>, std::array<char*, 1ul>)", "pid": 0, "tid": 7,
"ts": 6071193079493.507, "dur": 1.344,
"args": {
"External id": 41, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 181, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.030303, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0
}
},
{
"ph": "f", "id": 181, "pid": 0, "tid": 7, "ts": 6071193079493.507,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 2337800, "tid": 2337800,
"ts": 6071193020595.912, "dur": 8.402,
"args": {
"External id": 41, "cbid": 211, "correlation": 181
}
},
{
"ph": "s", "id": 181, "pid": 2337800, "tid": 2337800, "ts": 6071193020595.912,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 2337800, "tid": 2337800,
"ts": 6071193020652.367, "dur": 2.083,
"args": {
"External id": 2, "cbid": 135, "correlation": 189
}
},
{
"ph": "f", "id": 189, "pid": 2337800, "tid": 2337800, "ts": 6071193020652.367,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_0", "pid": 0, "tid": 7,
"ts": 6071193079495.811, "dur": 61.088,
"args": {
"External id": 515, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 198, "registers per thread": 16, "shared memory": 0, "blocks per SM": 285.553040, "warps per SM": 1142.212158, "grid": [37693, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100
}
},
{
"ph": "f", "id": 198, "pid": 0, "tid": 7, "ts": 6071193079495.811,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2340515,
"ts": 6071193021185.510, "dur": 39.610,
"args": {
"External id": 515, "cbid": 307, "correlation": 198
}
},
{
"ph": "s", "id": 198, "pid": 2337800, "tid": 2340515, "ts": 6071193021185.510,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_1", "pid": 0, "tid": 7,
"ts": 6071193079557.763, "dur": 2.336,
"args": {
"External id": 516, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 202, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.492424, "warps per SM": 5.969697, "grid": [197, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9
}
},
{
"ph": "f", "id": 202, "pid": 0, "tid": 7, "ts": 6071193079557.763,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2340515,
"ts": 6071193021248.836, "dur": 6.510,
"args": {
"External id": 516, "cbid": 307, "correlation": 202
}
},
{
"ph": "s", "id": 202, "pid": 2337800, "tid": 2340515, "ts": 6071193021248.836,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_mul_2", "pid": 0, "tid": 7,
"ts": 6071193079561.091, "dur": 47.488,
"args": {
"External id": 517, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 206, "registers per thread": 22, "shared memory": 0, "blocks per SM": 186.181824, "warps per SM": 744.727295, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100
}
},
{
"ph": "f", "id": 206, "pid": 0, "tid": 7, "ts": 6071193079561.091,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2340515,
"ts": 6071193021291.610, "dur": 17.827,
"args": {
"External id": 517, "cbid": 307, "correlation": 206
}
},
{
"ph": "s", "id": 206, "pid": 2337800, "tid": 2340515, "ts": 6071193021291.610,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 2337800, "tid": 2340515,
"ts": 6071193021360.124, "dur": 2.644,
"args": {
"External id": 513, "cbid": 135, "correlation": 211
}
},
{
"ph": "f", "id": 211, "pid": 2337800, "tid": 2340515, "ts": 6071193021360.124,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 2337800, "tid": 2340515,
"ts": 6071193021367.445, "dur": 0.420,
"args": {
"External id": 513, "cbid": 135, "correlation": 216
}
},
{
"ph": "f", "id": 216, "pid": 2337800, "tid": 2340515, "ts": 6071193021367.445,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 2337800, "tid": 2340515,
"ts": 6071193021370.309, "dur": 0.431,
"args": {
"External id": 513, "cbid": 135, "correlation": 221
}
},
{
"ph": "f", "id": 221, "pid": 2337800, "tid": 2340515, "ts": 6071193021370.309,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7,
"ts": 6071193079609.571, "dur": 55.168,
"args": {
"External id": 530, "device": 0, "context": 1, "stream": 7, "correlation": 255, "bytes": 50331648, "memory bandwidth (GB/s)": 912.3341067285382
}
},
{
"ph": "f", "id": 255, "pid": 0, "tid": 7, "ts": 6071193079609.571,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 2337800, "tid": 2340515,
"ts": 6071193021504.792, "dur": 20.120,
"args": {
"External id": 530, "cbid": 41, "correlation": 255
}
},
{
"ph": "s", "id": 255, "pid": 2337800, "tid": 2340515, "ts": 6071193021504.792,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 0, "tid": 7,
"ts": 6071193079671.587, "dur": 20918.622,
"args": {
"External id": 535, "device": 0, "context": 1, "stream": 7, "correlation": 261, "bytes": 50331648, "memory bandwidth (GB/s)": 2.4060690039716763
}
},
{
"ph": "f", "id": 261, "pid": 0, "tid": 7, "ts": 6071193079671.587,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 2337800, "tid": 2340515,
"ts": 6071193021573.485, "dur": 79912.230,
"args": {
"External id": 535, "cbid": 41, "correlation": 261
}
},
{
"ph": "s", "id": 261, "pid": 2337800, "tid": 2340515, "ts": 6071193021573.485,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 2337800, "tid": 2340515,
"ts": 6071193101491.093, "dur": 14.462,
"args": {
"External id": 535, "cbid": 131, "correlation": 262
}
},
{
"ph": "s", "id": 262, "pid": 2337800, "tid": 2340515, "ts": 6071193101491.093,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_red_fused_nll_loss_backward_nll_loss_forward_0", "pid": 0, "tid": 7,
"ts": 6071193106980.256, "dur": 7.488,
"args": {
"External id": 49, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 300, "registers per thread": 32, "shared memory": 16384, "blocks per SM": 0.030303, "warps per SM": 0.484848, "grid": [4, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 1
}
},
{
"ph": "f", "id": 300, "pid": 0, "tid": 7, "ts": 6071193106980.256,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800,
"ts": 6071193106938.697, "dur": 43.486,
"args": {
"External id": 49, "cbid": 307, "correlation": 300
}
},
{
"ph": "s", "id": 300, "pid": 2337800, "tid": 2337800, "ts": 6071193106938.697,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_per_fused_nll_loss_forward_1", "pid": 0, "tid": 7,
"ts": 6071193107016.736, "dur": 1.664,
"args": {
"External id": 50, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 308, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.015152, "grid": [1, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0
}
},
{
"ph": "f", "id": 308, "pid": 0, "tid": 7, "ts": 6071193107016.736,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800,
"ts": 6071193107008.322, "dur": 7.912,
"args": {
"External id": 50, "cbid": 307, "correlation": 308
}
},
{
"ph": "s", "id": 308, "pid": 2337800, "tid": 2337800, "ts": 6071193107008.322,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_mul_2", "pid": 0, "tid": 7,
"ts": 6071193107051.424, "dur": 44.128,
"args": {
"External id": 51, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 316, "registers per thread": 16, "shared memory": 0, "blocks per SM": 186.181824, "warps per SM": 744.727295, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100
}
},
{
"ph": "f", "id": 316, "pid": 0, "tid": 7, "ts": 6071193107051.424,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800,
"ts": 6071193107042.694, "dur": 7.441,
"args": {
"External id": 51, "cbid": 307, "correlation": 316
}
},
{
"ph": "s", "id": 316, "pid": 2337800, "tid": 2337800, "ts": 6071193107042.694,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_3", "pid": 0, "tid": 7,
"ts": 6071193107187.168, "dur": 1.376,
"args": {
"External id": 72, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 323, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.007576, "grid": [1, 1, 1], "block": [32, 1, 1], "est. achieved occupancy %": 0
}
},
{
"ph": "f", "id": 323, "pid": 0, "tid": 7, "ts": 6071193107187.168,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800,
"ts": 6071193107177.878, "dur": 7.912,
"args": {
"External id": 72, "cbid": 307, "correlation": 323
}
},
{
"ph": "s", "id": 323, "pid": 2337800, "tid": 2337800, "ts": 6071193107177.878,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_4", "pid": 0, "tid": 7,
"ts": 6071193107214.240, "dur": 32.256,
"args": {
"External id": 73, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 330, "registers per thread": 16, "shared memory": 0, "blocks per SM": 285.553040, "warps per SM": 1142.212158, "grid": [37693, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100
}
},
{
"ph": "f", "id": 330, "pid": 0, "tid": 7, "ts": 6071193107214.240,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800,
"ts": 6071193107206.431, "dur": 6.319,
"args": {
"External id": 73, "cbid": 307, "correlation": 330
}
},
{
"ph": "s", "id": 330, "pid": 2337800, "tid": 2337800, "ts": 6071193107206.431,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_5", "pid": 0, "tid": 7,
"ts": 6071193107247.392, "dur": 1.440,
"args": {
"External id": 74, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 337, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.750000, "warps per SM": 3.000000, "grid": [99, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 5
}
},
{
"ph": "f", "id": 337, "pid": 0, "tid": 7, "ts": 6071193107247.392,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800,
"ts": 6071193107230.587, "dur": 7.331,
"args": {
"External id": 74, "cbid": 307, "correlation": 337
}
},
{
"ph": "s", "id": 337, "pid": 2337800, "tid": 2337800, "ts": 6071193107230.587,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 2337800, "tid": 2337800,
"ts": 6071193107368.966, "dur": 2.143,
"args": {
"External id": 75, "cbid": 200, "correlation": 352
}
},
{
"ph": "f", "id": 352, "pid": 2337800, "tid": 2337800, "ts": 6071193107368.966,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2<cutlass_75_tensorop_bf16_s1688gemm_bf16_128x128_tn_align1>(cutlass_75_tensorop_bf16_s1688gemm_bf16_128x128_tn_align1::Params)", "pid": 0, "tid": 7,
"ts": 6071193107384.320, "dur": 27572.893,
"args": {
"External id": 75, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 354, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 775.757568, "warps per SM": 3103.030273, "grid": [2048, 50, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13
}
},
{
"ph": "f", "id": 354, "pid": 0, "tid": 7, "ts": 6071193107384.320,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800,
"ts": 6071193107374.654, "dur": 7.331,
"args": {
"External id": 75, "cbid": 307, "correlation": 354
}
},
{
"ph": "s", "id": 354, "pid": 2337800, "tid": 2337800, "ts": 6071193107374.654,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_red_fused__log_softmax__log_softmax_backward_data_addmm_nll_loss_backward_nll_loss_forward_6", "pid": 0, "tid": 7,
"ts": 6071193134958.237, "dur": 5611.168,
"args": {
"External id": 76, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 375, "registers per thread": 48, "shared memory": 32, "blocks per SM": 248.242432, "warps per SM": 1985.939453, "grid": [32768, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 63
}
},
{
"ph": "f", "id": 375, "pid": 0, "tid": 7, "ts": 6071193134958.237,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800,
"ts": 6071193107427.794, "dur": 8.333,
"args": {
"External id": 76, "cbid": 307, "correlation": 375
}
},
{
"ph": "s", "id": 375, "pid": 2337800, "tid": 2337800, "ts": 6071193107427.794,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 2337800, "tid": 2337800,
"ts": 6071193107458.561, "dur": 0.441,
"args": {
"External id": 77, "cbid": 200, "correlation": 388
}
},
{
"ph": "f", "id": 388, "pid": 2337800, "tid": 2337800, "ts": 6071193107458.561,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2<cutlass_75_tensorop_bf16_s1688gemm_bf16_256x128_nn_align1>(cutlass_75_tensorop_bf16_s1688gemm_bf16_256x128_nn_align1::Params)", "pid": 0, "tid": 7,
"ts": 6071193140570.332, "dur": 19801.471,
"args": {
"External id": 77, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 391, "registers per thread": 229, "shared memory": 49152, "blocks per SM": 7.757576, "warps per SM": 62.060608, "grid": [1024, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13
}
},
{
"ph": "f", "id": 391, "pid": 0, "tid": 7, "ts": 6071193140570.332,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800,
"ts": 6071193107462.697, "dur": 5.609,
"args": {
"External id": 77, "cbid": 307, "correlation": 391
}
},
{
"ph": "s", "id": 391, "pid": 2337800, "tid": 2337800, "ts": 6071193107462.697,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_red_fused_nll_loss_forward_sum_7", "pid": 0, "tid": 7,
"ts": 6071193160372.666, "dur": 1657.024,
"args": {
"External id": 78, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 402, "registers per thread": 40, "shared memory": 4096, "blocks per SM": 5.954545, "warps per SM": 95.272728, "grid": [786, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 75
}
},
{
"ph": "f", "id": 402, "pid": 0, "tid": 7, "ts": 6071193160372.666,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800,
"ts": 6071193107493.844, "dur": 7.391,
"args": {
"External id": 78, "cbid": 307, "correlation": 402
}
},
{
"ph": "s", "id": 402, "pid": 2337800, "tid": 2337800, "ts": 6071193107493.844,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7,
"ts": 6071193162030.650, "dur": 75.200,
"args": {
"External id": 79, "device": 0, "context": 1, "stream": 7, "correlation": 409, "bytes": 77194752, "memory bandwidth (GB/s)": 1026.5259574468084
}
},
{
"ph": "f", "id": 409, "pid": 0, "tid": 7, "ts": 6071193162030.650,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 2337800, "tid": 2337800,
"ts": 6071193107531.160, "dur": 29.655,
"args": {
"External id": 79, "cbid": 41, "correlation": 409
}
},
{
"ph": "s", "id": 409, "pid": 2337800, "tid": 2337800, "ts": 6071193107531.160,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 2337800, "tid": 2337800,
"ts": 6071193107580.274, "dur": 0.271,
"args": {
"External id": 79, "cbid": 200, "correlation": 420
}
},
{
"ph": "f", "id": 420, "pid": 2337800, "tid": 2337800, "ts": 6071193107580.274,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2<cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_256x128_32x6_nt_align8>(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_256x128_32x6_nt_align8::Params)", "pid": 0, "tid": 7,
"ts": 6071193162106.778, "dur": 5807.648,
"args": {
"External id": 79, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 423, "registers per thread": 216, "shared memory": 147456, "blocks per SM": 11.909091, "warps per SM": 95.272728, "grid": [1572, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0
}
},
{
"ph": "f", "id": 423, "pid": 0, "tid": 7, "ts": 6071193162106.778,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800,
"ts": 6071193107582.107, "dur": 5.929,
"args": {
"External id": 79, "cbid": 307, "correlation": 423
}
},
{
"ph": "s", "id": 423, "pid": 2337800, "tid": 2337800, "ts": 6071193107582.107,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_red_fused_nll_loss_forward_8", "pid": 0, "tid": 7,
"ts": 6071193167916.313, "dur": 2.688,
"args": {
"External id": 80, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 435, "registers per thread": 26, "shared memory": 64, "blocks per SM": 0.030303, "warps per SM": 0.484848, "grid": [4, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 1
}
},
{
"ph": "f", "id": 435, "pid": 0, "tid": 7, "ts": 6071193167916.313,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800,
"ts": 6071193107624.000, "dur": 6.861,
"args": {
"External id": 80, "cbid": 307, "correlation": 435
}
},
{
"ph": "s", "id": 435, "pid": 2337800, "tid": 2337800, "ts": 6071193107624.000,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_per_fused_nll_loss_forward_9", "pid": 0, "tid": 7,
"ts": 6071193167919.929, "dur": 1.696,
"args": {
"External id": 81, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 440, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.015152, "grid": [1, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0
}
},
{
"ph": "f", "id": 440, "pid": 0, "tid": 7, "ts": 6071193167919.929,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800,
"ts": 6071193107653.645, "dur": 6.119,
"args": {
"External id": 81, "cbid": 307, "correlation": 440
}
},
{
"ph": "s", "id": 440, "pid": 2337800, "tid": 2337800, "ts": 6071193107653.645,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<8, at::native::FillFunctor<c10::BFloat16>, std::array<char*, 1ul> >(int, at::native::FillFunctor<c10::BFloat16>, std::array<char*, 1ul>)", "pid": 0, "tid": 7,
"ts": 6071193167922.553, "dur": 1.376,
"args": {
"External id": 85, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 451, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.030303, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0
}
},
{
"ph": "f", "id": 451, "pid": 0, "tid": 7, "ts": 6071193167922.553,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 2337800, "tid": 2337800,
"ts": 6071193107827.848, "dur": 10.375,
"args": {
"External id": 85, "cbid": 211, "correlation": 451
}
},
{
"ph": "s", "id": 451, "pid": 2337800, "tid": 2337800, "ts": 6071193107827.848,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 2337800, "tid": 2337800,
"ts": 6071193107884.503, "dur": 2.764,
"args": {
"External id": 46, "cbid": 135, "correlation": 459
}
},
{
"ph": "f", "id": 459, "pid": 2337800, "tid": 2337800, "ts": 6071193107884.503,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_0", "pid": 0, "tid": 7,
"ts": 6071193167924.762, "dur": 61.343,
"args": {
"External id": 538, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 468, "registers per thread": 16, "shared memory": 0, "blocks per SM": 285.553040, "warps per SM": 1142.212158, "grid": [37693, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100
}
},
{
"ph": "f", "id": 468, "pid": 0, "tid": 7, "ts": 6071193167924.762,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2340515,
"ts": 6071193108355.643, "dur": 32.590,
"args": {
"External id": 538, "cbid": 307, "correlation": 468
}
},
{
"ph": "s", "id": 468, "pid": 2337800, "tid": 2340515, "ts": 6071193108355.643,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_1", "pid": 0, "tid": 7,
"ts": 6071193167987.482, "dur": 2.399,
"args": {
"External id": 539, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 472, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.492424, "warps per SM": 5.969697, "grid": [197, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9
}
},
{
"ph": "f", "id": 472, "pid": 0, "tid": 7, "ts": 6071193167987.482,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2340515,
"ts": 6071193108412.139, "dur": 6.249,
"args": {
"External id": 539, "cbid": 307, "correlation": 472
}
},
{
"ph": "s", "id": 472, "pid": 2337800, "tid": 2340515, "ts": 6071193108412.139,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_mul_2", "pid": 0, "tid": 7,
"ts": 6071193167990.809, "dur": 47.648,
"args": {
"External id": 540, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 476, "registers per thread": 22, "shared memory": 0, "blocks per SM": 186.181824, "warps per SM": 744.727295, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100
}
},
{
"ph": "f", "id": 476, "pid": 0, "tid": 7, "ts": 6071193167990.809,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2340515,
"ts": 6071193108452.099, "dur": 5.208,
"args": {
"External id": 540, "cbid": 307, "correlation": 476
}
},
{
"ph": "s", "id": 476, "pid": 2337800, "tid": 2340515, "ts": 6071193108452.099,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 2337800, "tid": 2340515,
"ts": 6071193108501.533, "dur": 2.224,
"args": {
"External id": 536, "cbid": 135, "correlation": 481
}
},
{
"ph": "f", "id": 481, "pid": 2337800, "tid": 2340515, "ts": 6071193108501.533,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 2337800, "tid": 2340515,
"ts": 6071193108507.282, "dur": 0.411,
"args": {
"External id": 536, "cbid": 135, "correlation": 486
}
},
{
"ph": "f", "id": 486, "pid": 2337800, "tid": 2340515, "ts": 6071193108507.282,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 2337800, "tid": 2340515,
"ts": 6071193108510.106, "dur": 0.431,
"args": {
"External id": 536, "cbid": 135, "correlation": 491
}
},
{
"ph": "f", "id": 491, "pid": 2337800, "tid": 2340515, "ts": 6071193108510.106,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7,
"ts": 6071193168039.321, "dur": 55.232,
"args": {
"External id": 553, "device": 0, "context": 1, "stream": 7, "correlation": 525, "bytes": 50331648, "memory bandwidth (GB/s)": 911.2769409038239
}
},
{
"ph": "f", "id": 525, "pid": 0, "tid": 7, "ts": 6071193168039.321,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 2337800, "tid": 2340515,
"ts": 6071193108626.061, "dur": 19.129,
"args": {
"External id": 553, "cbid": 41, "correlation": 525
}
},
{
"ph": "s", "id": 525, "pid": 2337800, "tid": 2340515, "ts": 6071193108626.061,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 0, "tid": 7,
"ts": 6071193168104.057, "dur": 21092.190,
"args": {
"External id": 558, "device": 0, "context": 1, "stream": 7, "correlation": 531, "bytes": 50331648, "memory bandwidth (GB/s)": 2.386269420102891
}
},
{
"ph": "f", "id": 531, "pid": 0, "tid": 7, "ts": 6071193168104.057,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 2337800, "tid": 2340515,
"ts": 6071193108693.763, "dur": 81783.001,
"args": {
"External id": 558, "cbid": 41, "correlation": 531
}
},
{
"ph": "s", "id": 531, "pid": 2337800, "tid": 2340515, "ts": 6071193108693.763,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 2337800, "tid": 2340515,
"ts": 6071193190484.065, "dur": 15.543,
"args": {
"External id": 558, "cbid": 131, "correlation": 532
}
},
{
"ph": "s", "id": 532, "pid": 2337800, "tid": 2340515, "ts": 6071193190484.065,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_red_fused_nll_loss_backward_nll_loss_forward_0", "pid": 0, "tid": 7,
"ts": 6071193195203.991, "dur": 7.136,
"args": {
"External id": 93, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 570, "registers per thread": 32, "shared memory": 16384, "blocks per SM": 0.030303, "warps per SM": 0.484848, "grid": [4, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 1
}
},
{
"ph": "f", "id": 570, "pid": 0, "tid": 7, "ts": 6071193195203.991,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800,
"ts": 6071193195157.671, "dur": 48.223,
"args": {
"External id": 93, "cbid": 307, "correlation": 570
}
},
{
"ph": "s", "id": 570, "pid": 2337800, "tid": 2337800, "ts": 6071193195157.671,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_per_fused_nll_loss_forward_1", "pid": 0, "tid": 7,
"ts": 6071193195243.383, "dur": 1.600,
"args": {
"External id": 94, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 578, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.015152, "grid": [1, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0
}
},
{
"ph": "f", "id": 578, "pid": 0, "tid": 7, "ts": 6071193195243.383,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800,
"ts": 6071193195234.337, "dur": 8.353,
"args": {
"External id": 94, "cbid": 307, "correlation": 578
}
},
{
"ph": "s", "id": 578, "pid": 2337800, "tid": 2337800, "ts": 6071193195234.337,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_mul_2", "pid": 0, "tid": 7,
"ts": 6071193195280.983, "dur": 43.840,
"args": {
"External id": 95, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 586, "registers per thread": 16, "shared memory": 0, "blocks per SM": 186.181824, "warps per SM": 744.727295, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100
}
},
{
"ph": "f", "id": 586, "pid": 0, "tid": 7, "ts": 6071193195280.983,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800,
"ts": 6071193195272.645, "dur": 7.321,
"args": {
"External id": 95, "cbid": 307, "correlation": 586
}
},
{
"ph": "s", "id": 586, "pid": 2337800, "tid": 2337800, "ts": 6071193195272.645,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_3", "pid": 0, "tid": 7,
"ts": 6071193195454.198, "dur": 1.377,
"args": {
"External id": 116, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 593, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.007576, "grid": [1, 1, 1], "block": [32, 1, 1], "est. achieved occupancy %": 0
}
},
{
"ph": "f", "id": 593, "pid": 0, "tid": 7, "ts": 6071193195454.198,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800,
"ts": 6071193195444.274, "dur": 8.432,
"args": {
"External id": 116, "cbid": 307, "correlation": 593
}
},
{
"ph": "s", "id": 593, "pid": 2337800, "tid": 2337800, "ts": 6071193195444.274,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_4", "pid": 0, "tid": 7,
"ts": 6071193195481.687, "dur": 31.552,
"args": {
"External id": 117, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 600, "registers per thread": 16, "shared memory": 0, "blocks per SM": 285.553040, "warps per SM": 1142.212158, "grid": [37693, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100
}
},
{
"ph": "f", "id": 600, "pid": 0, "tid": 7, "ts": 6071193195481.687,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800,
"ts": 6071193195473.848, "dur": 6.530,
"args": {
"External id": 117, "cbid": 307, "correlation": 600
}
},
{
"ph": "s", "id": 600, "pid": 2337800, "tid": 2337800, "ts": 6071193195473.848,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_5", "pid": 0, "tid": 7,
"ts": 6071193195514.135, "dur": 1.376,
"args": {
"External id": 118, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 607, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.750000, "warps per SM": 3.000000, "grid": [99, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 5
}
},
{
"ph": "f", "id": 607, "pid": 0, "tid": 7, "ts": 6071193195514.135,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800,
"ts": 6071193195499.106, "dur": 6.580,
"args": {
"External id": 118, "cbid": 307, "correlation": 607
}
},
{
"ph": "s", "id": 607, "pid": 2337800, "tid": 2337800, "ts": 6071193195499.106,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 2337800, "tid": 2337800,
"ts": 6071193195626.779, "dur": 2.503,
"args": {
"External id": 119, "cbid": 200, "correlation": 622
}
},
{
"ph": "f", "id": 622, "pid": 2337800, "tid": 2337800, "ts": 6071193195626.779,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2<cutlass_75_tensorop_bf16_s1688gemm_bf16_128x128_tn_align1>(cutlass_75_tensorop_bf16_s1688gemm_bf16_128x128_tn_align1::Params)", "pid": 0, "tid": 7,
"ts": 6071193195643.095, "dur": 26833.788,
"args": {
"External id": 119, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 624, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 775.757568, "warps per SM": 3103.030273, "grid": [2048, 50, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13
}
},
{
"ph": "f", "id": 624, "pid": 0, "tid": 7, "ts": 6071193195643.095,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800,
"ts": 6071193195632.868, "dur": 8.302,
"args": {
"External id": 119, "cbid": 307, "correlation": 624
}
},
{
"ph": "s", "id": 624, "pid": 2337800, "tid": 2337800, "ts": 6071193195632.868,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_red_fused__log_softmax__log_softmax_backward_data_addmm_nll_loss_backward_nll_loss_forward_6", "pid": 0, "tid": 7,
"ts": 6071193222477.876, "dur": 5565.599,
"args": {
"External id": 120, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 645, "registers per thread": 48, "shared memory": 32, "blocks per SM": 248.242432, "warps per SM": 1985.939453, "grid": [32768, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 63
}
},
{
"ph": "f", "id": 645, "pid": 0, "tid": 7, "ts": 6071193222477.876,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800,
"ts": 6071193195684.486, "dur": 8.132,
"args": {
"External id": 120, "cbid": 307, "correlation": 645
}
},
{
"ph": "s", "id": 645, "pid": 2337800, "tid": 2337800, "ts": 6071193195684.486,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 2337800, "tid": 2337800,
"ts": 6071193195716.875, "dur": 0.430,
"args": {
"External id": 121, "cbid": 200, "correlation": 658
}
},
{
"ph": "f", "id": 658, "pid": 2337800, "tid": 2337800, "ts": 6071193195716.875,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2<cutlass_75_tensorop_bf16_s1688gemm_bf16_256x128_nn_align1>(cutlass_75_tensorop_bf16_s1688gemm_bf16_256x128_nn_align1::Params)", "pid": 0, "tid": 7,
"ts": 6071193228044.435, "dur": 19615.038,
"args": {
"External id": 121, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 661, "registers per thread": 229, "shared memory": 49152, "blocks per SM": 7.757576, "warps per SM": 62.060608, "grid": [1024, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13
}
},
{
"ph": "f", "id": 661, "pid": 0, "tid": 7, "ts": 6071193228044.435,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800,
"ts": 6071193195721.111, "dur": 5.428,
"args": {
"External id": 121, "cbid": 307, "correlation": 661
}
},
{
"ph": "s", "id": 661, "pid": 2337800, "tid": 2337800, "ts": 6071193195721.111,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_red_fused_nll_loss_forward_sum_7", "pid": 0, "tid": 7,
"ts": 6071193247660.369, "dur": 1660.351,
"args": {
"External id": 122, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 672, "registers per thread": 40, "shared memory": 4096, "blocks per SM": 5.954545, "warps per SM": 95.272728, "grid": [786, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 75
}
},
{
"ph": "f", "id": 672, "pid": 0, "tid": 7, "ts": 6071193247660.369,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800,
"ts": 6071193195753.209, "dur": 6.720,
"args": {
"External id": 122, "cbid": 307, "correlation": 672
}
},
{
"ph": "s", "id": 672, "pid": 2337800, "tid": 2337800, "ts": 6071193195753.209,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7,
"ts": 6071193249321.617, "dur": 75.328,
"args": {
"External id": 123, "device": 0, "context": 1, "stream": 7, "correlation": 679, "bytes": 77194752, "memory bandwidth (GB/s)": 1024.7816482582837
}
},
{
"ph": "f", "id": 679, "pid": 0, "tid": 7, "ts": 6071193249321.617,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 2337800, "tid": 2337800,
"ts": 6071193195794.952, "dur": 30.246,
"args": {
"External id": 123, "cbid": 41, "correlation": 679
}
},
{
"ph": "s", "id": 679, "pid": 2337800, "tid": 2337800, "ts": 6071193195794.952,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 2337800, "tid": 2337800,
"ts": 6071193195846.049, "dur": 0.651,
"args": {
"External id": 123, "cbid": 200, "correlation": 690
}
},
{
"ph": "f", "id": 690, "pid": 2337800, "tid": 2337800, "ts": 6071193195846.049,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2<cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_256x128_32x6_nt_align8>(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_256x128_32x6_nt_align8::Params)", "pid": 0, "tid": 7,
"ts": 6071193249397.744, "dur": 5816.192,
"args": {
"External id": 123, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 693, "registers per thread": 216, "shared memory": 147456, "blocks per SM": 11.909091, "warps per SM": 95.272728, "grid": [1572, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0
}
},
{
"ph": "f", "id": 693, "pid": 0, "tid": 7, "ts": 6071193249397.744,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800,
"ts": 6071193195848.874, "dur": 5.508,
"args": {
"External id": 123, "cbid": 307, "correlation": 693
}
},
{
"ph": "s", "id": 693, "pid": 2337800, "tid": 2337800, "ts": 6071193195848.874,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_red_fused_nll_loss_forward_8", "pid": 0, "tid": 7,
"ts": 6071193255215.760, "dur": 2.656,
"args": {
"External id": 124, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 705, "registers per thread": 26, "shared memory": 64, "blocks per SM": 0.030303, "warps per SM": 0.484848, "grid": [4, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 1
}
},
{
"ph": "f", "id": 705, "pid": 0, "tid": 7, "ts": 6071193255215.760,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800,
"ts": 6071193195892.219, "dur": 6.600,
"args": {
"External id": 124, "cbid": 307, "correlation": 705
}
},
{
"ph": "s", "id": 705, "pid": 2337800, "tid": 2337800, "ts": 6071193195892.219,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_per_fused_nll_loss_forward_9", "pid": 0, "tid": 7,
"ts": 6071193255219.376, "dur": 1.696,
"args": {
"External id": 125, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 710, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.015152, "grid": [1, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0
}
},
{
"ph": "f", "id": 710, "pid": 0, "tid": 7, "ts": 6071193255219.376,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800,
"ts": 6071193195920.772, "dur": 5.518,
"args": {
"External id": 125, "cbid": 307, "correlation": 710
}
},
{
"ph": "s", "id": 710, "pid": 2337800, "tid": 2337800, "ts": 6071193195920.772,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<8, at::native::FillFunctor<c10::BFloat16>, std::array<char*, 1ul> >(int, at::native::FillFunctor<c10::BFloat16>, std::array<char*, 1ul>)", "pid": 0, "tid": 7,
"ts": 6071193255221.968, "dur": 1.376,
"args": {
"External id": 129, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 721, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.030303, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0
}
},
{
"ph": "f", "id": 721, "pid": 0, "tid": 7, "ts": 6071193255221.968,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 2337800, "tid": 2337800,
"ts": 6071193196109.667, "dur": 9.504,
"args": {
"External id": 129, "cbid": 211, "correlation": 721
}
},
{
"ph": "s", "id": 721, "pid": 2337800, "tid": 2337800, "ts": 6071193196109.667,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 2337800, "tid": 2337800,
"ts": 6071193196161.815, "dur": 3.145,
"args": {
"External id": 90, "cbid": 135, "correlation": 729
}
},
{
"ph": "f", "id": 729, "pid": 2337800, "tid": 2337800, "ts": 6071193196161.815,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_0", "pid": 0, "tid": 7,
"ts": 6071193255224.304, "dur": 61.536,
"args": {
"External id": 561, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 738, "registers per thread": 16, "shared memory": 0, "blocks per SM": 285.553040, "warps per SM": 1142.212158, "grid": [37693, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100
}
},
{
"ph": "f", "id": 738, "pid": 0, "tid": 7, "ts": 6071193255224.304,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2340515,
"ts": 6071193196642.580, "dur": 38.368,
"args": {
"External id": 561, "cbid": 307, "correlation": 738
}
},
{
"ph": "s", "id": 738, "pid": 2337800, "tid": 2340515, "ts": 6071193196642.580,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_1", "pid": 0, "tid": 7,
"ts": 6071193255286.832, "dur": 2.208,
"args": {
"External id": 562, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 742, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.492424, "warps per SM": 5.969697, "grid": [197, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9
}
},
{
"ph": "f", "id": 742, "pid": 0, "tid": 7, "ts": 6071193255286.832,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2340515,
"ts": 6071193196703.392, "dur": 7.441,
"args": {
"External id": 562, "cbid": 307, "correlation": 742
}
},
{
"ph": "s", "id": 742, "pid": 2337800, "tid": 2340515, "ts": 6071193196703.392,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_mul_2", "pid": 0, "tid": 7,
"ts": 6071193255289.936, "dur": 47.328,
"args": {
"External id": 563, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 746, "registers per thread": 22, "shared memory": 0, "blocks per SM": 186.181824, "warps per SM": 744.727295, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100
}
},
{
"ph": "f", "id": 746, "pid": 0, "tid": 7, "ts": 6071193255289.936,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2340515,
"ts": 6071193196744.724, "dur": 6.270,
"args": {
"External id": 563, "cbid": 307, "correlation": 746
}
},
{
"ph": "s", "id": 746, "pid": 2337800, "tid": 2340515, "ts": 6071193196744.724,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 2337800, "tid": 2340515,
"ts": 6071193196796.743, "dur": 2.944,
"args": {
"External id": 559, "cbid": 135, "correlation": 751
}
},
{
"ph": "f", "id": 751, "pid": 2337800, "tid": 2340515, "ts": 6071193196796.743,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 2337800, "tid": 2340515,
"ts": 6071193196804.314, "dur": 0.691,
"args": {
"External id": 559, "cbid": 135, "correlation": 756
}
},
{
"ph": "f", "id": 756, "pid": 2337800, "tid": 2340515, "ts": 6071193196804.314,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 2337800, "tid": 2340515,
"ts": 6071193196808.220, "dur": 0.501,
"args": {
"External id": 559, "cbid": 135, "correlation": 761
}
},
{
"ph": "f", "id": 761, "pid": 2337800, "tid": 2340515, "ts": 6071193196808.220,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7,
"ts": 6071193255338.224, "dur": 55.488,
"args": {
"External id": 576, "device": 0, "context": 1, "stream": 7, "correlation": 795, "bytes": 50331648, "memory bandwidth (GB/s)": 907.0726643598616
}
},
{
"ph": "f", "id": 795, "pid": 0, "tid": 7, "ts": 6071193255338.224,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 2337800, "tid": 2340515,
"ts": 6071193196940.860, "dur": 21.042,
"args": {
"External id": 576, "cbid": 41, "correlation": 795
}
},
{
"ph": "s", "id": 795, "pid": 2337800, "tid": 2340515, "ts": 6071193196940.860,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 0, "tid": 7,
"ts": 6071193255397.168, "dur": 21805.245,
"args": {
"External id": 581, "device": 0, "context": 1, "stream": 7, "correlation": 801, "bytes": 50331648, "memory bandwidth (GB/s)": 2.3082358395881357
}
},
{
"ph": "f", "id": 801, "pid": 0, "tid": 7, "ts": 6071193255397.168,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 2337800, "tid": 2340515,
"ts": 6071193197021.381, "dur": 81131.458,
"args": {
"External id": 581, "cbid": 41, "correlation": 801
}
},
{
"ph": "s", "id": 801, "pid": 2337800, "tid": 2340515, "ts": 6071193197021.381,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 2337800, "tid": 2340515,
"ts": 6071193278157.306, "dur": 14.943,
"args": {
"External id": 581, "cbid": 131, "correlation": 802
}
},
{
"ph": "s", "id": 802, "pid": 2337800, "tid": 2340515, "ts": 6071193278157.306,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_red_fused_nll_loss_backward_nll_loss_forward_0", "pid": 0, "tid": 7,
"ts": 6071193282883.853, "dur": 6.976,
"args": {
"External id": 137, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 840, "registers per thread": 32, "shared memory": 16384, "blocks per SM": 0.030303, "warps per SM": 0.484848, "grid": [4, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 1
}
},
{
"ph": "f", "id": 840, "pid": 0, "tid": 7, "ts": 6071193282883.853,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800,
"ts": 6071193282835.910, "dur": 50.006,
"args": {
"External id": 137, "cbid": 307, "correlation": 840
}
},
{
"ph": "s", "id": 840, "pid": 2337800, "tid": 2337800, "ts": 6071193282835.910,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_per_fused_nll_loss_forward_1", "pid": 0, "tid": 7,
"ts": 6071193282921.677, "dur": 1.632,
"args": {
"External id": 138, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 848, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.015152, "grid": [1, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0
}
},
{
"ph": "f", "id": 848, "pid": 0, "tid": 7, "ts": 6071193282921.677,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800,
"ts": 6071193282912.726, "dur": 8.343,
"args": {
"External id": 138, "cbid": 307, "correlation": 848
}
},
{
"ph": "s", "id": 848, "pid": 2337800, "tid": 2337800, "ts": 6071193282912.726,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_mul_2", "pid": 0, "tid": 7,
"ts": 6071193282956.461, "dur": 43.712,
"args": {
"External id": 139, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 856, "registers per thread": 16, "shared memory": 0, "blocks per SM": 186.181824, "warps per SM": 744.727295, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100
}
},
{
"ph": "f", "id": 856, "pid": 0, "tid": 7, "ts": 6071193282956.461,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800,
"ts": 6071193282947.699, "dur": 7.772,
"args": {
"External id": 139, "cbid": 307, "correlation": 856
}
},
{
"ph": "s", "id": 856, "pid": 2337800, "tid": 2337800, "ts": 6071193282947.699,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_3", "pid": 0, "tid": 7,
"ts": 6071193283105.069, "dur": 1.344,
"args": {
"External id": 160, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 863, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.007576, "grid": [1, 1, 1], "block": [32, 1, 1], "est. achieved occupancy %": 0
}
},
{
"ph": "f", "id": 863, "pid": 0, "tid": 7, "ts": 6071193283105.069,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800,
"ts": 6071193283094.821, "dur": 9.114,
"args": {
"External id": 160, "cbid": 307, "correlation": 863
}
},
{
"ph": "s", "id": 863, "pid": 2337800, "tid": 2337800, "ts": 6071193283094.821,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_4", "pid": 0, "tid": 7,
"ts": 6071193283132.941, "dur": 32.000,
"args": {
"External id": 161, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 870, "registers per thread": 16, "shared memory": 0, "blocks per SM": 285.553040, "warps per SM": 1142.212158, "grid": [37693, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100
}
},
{
"ph": "f", "id": 870, "pid": 0, "tid": 7, "ts": 6071193283132.941,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800,
"ts": 6071193283124.936, "dur": 6.520,
"args": {
"External id": 161, "cbid": 307, "correlation": 870
}
},
{
"ph": "s", "id": 870, "pid": 2337800, "tid": 2337800, "ts": 6071193283124.936,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_5", "pid": 0, "tid": 7,
"ts": 6071193283165.965, "dur": 1.408,
"args": {
"External id": 162, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 877, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.750000, "warps per SM": 3.000000, "grid": [99, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 5
}
},
{
"ph": "f", "id": 877, "pid": 0, "tid": 7, "ts": 6071193283165.965,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800,
"ts": 6071193283150.575, "dur": 7.061,
"args": {
"External id": 162, "cbid": 307, "correlation": 877
}
},
{
"ph": "s", "id": 877, "pid": 2337800, "tid": 2337800, "ts": 6071193283150.575,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 2337800, "tid": 2337800,
"ts": 6071193283284.246, "dur": 2.935,
"args": {
"External id": 163, "cbid": 200, "correlation": 892
}
},
{
"ph": "f", "id": 892, "pid": 2337800, "tid": 2337800, "ts": 6071193283284.246,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2<cutlass_75_tensorop_bf16_s1688gemm_bf16_128x128_tn_align1>(cutlass_75_tensorop_bf16_s1688gemm_bf16_128x128_tn_align1::Params)", "pid": 0, "tid": 7,
"ts": 6071193283326.669, "dur": 26839.005,
"args": {
"External id": 163, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 894, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 775.757568, "warps per SM": 3103.030273, "grid": [2048, 50, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13
}
},
{
"ph": "f", "id": 894, "pid": 0, "tid": 7, "ts": 6071193283326.669,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800,
"ts": 6071193283291.027, "dur": 33.410,
"args": {
"External id": 163, "cbid": 307, "correlation": 894
}
},
{
"ph": "s", "id": 894, "pid": 2337800, "tid": 2337800, "ts": 6071193283291.027,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_red_fused__log_softmax__log_softmax_backward_data_addmm_nll_loss_backward_nll_loss_forward_6", "pid": 0, "tid": 7,
"ts": 6071193310166.602, "dur": 5565.183,
"args": {
"External id": 164, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 915, "registers per thread": 48, "shared memory": 32, "blocks per SM": 248.242432, "warps per SM": 1985.939453, "grid": [32768, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 63
}
},
{
"ph": "f", "id": 915, "pid": 0, "tid": 7, "ts": 6071193310166.602,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800,
"ts": 6071193283371.638, "dur": 8.934,
"args": {
"External id": 164, "cbid": 307, "correlation": 915
}
},
{
"ph": "s", "id": 915, "pid": 2337800, "tid": 2337800, "ts": 6071193283371.638,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 2337800, "tid": 2337800,
"ts": 6071193283405.710, "dur": 0.430,
"args": {
"External id": 165, "cbid": 200, "correlation": 928
}
},
{
"ph": "f", "id": 928, "pid": 2337800, "tid": 2337800, "ts": 6071193283405.710,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2<cutlass_75_tensorop_bf16_s1688gemm_bf16_256x128_nn_align1>(cutlass_75_tensorop_bf16_s1688gemm_bf16_256x128_nn_align1::Params)", "pid": 0, "tid": 7,
"ts": 6071193315732.585, "dur": 19614.879,
"args": {
"External id": 165, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 931, "registers per thread": 229, "shared memory": 49152, "blocks per SM": 7.757576, "warps per SM": 62.060608, "grid": [1024, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13
}
},
{
"ph": "f", "id": 931, "pid": 0, "tid": 7, "ts": 6071193315732.585,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800,
"ts": 6071193283410.577, "dur": 6.109,
"args": {
"External id": 165, "cbid": 307, "correlation": 931
}
},
{
"ph": "s", "id": 931, "pid": 2337800, "tid": 2337800, "ts": 6071193283410.577,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_red_fused_nll_loss_forward_sum_7", "pid": 0, "tid": 7,
"ts": 6071193335348.392, "dur": 1660.319,
"args": {
"External id": 166, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 942, "registers per thread": 40, "shared memory": 4096, "blocks per SM": 5.954545, "warps per SM": 95.272728, "grid": [786, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 75
}
},
{
"ph": "f", "id": 942, "pid": 0, "tid": 7, "ts": 6071193335348.392,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800,
"ts": 6071193283444.258, "dur": 7.010,
"args": {
"External id": 166, "cbid": 307, "correlation": 942
}
},
{
"ph": "s", "id": 942, "pid": 2337800, "tid": 2337800, "ts": 6071193283444.258,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7,
"ts": 6071193337009.703, "dur": 75.104,
"args": {
"External id": 167, "device": 0, "context": 1, "stream": 7, "correlation": 949, "bytes": 77194752, "memory bandwidth (GB/s)": 1027.83809118023
}
},
{
"ph": "f", "id": 949, "pid": 0, "tid": 7, "ts": 6071193337009.703,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 2337800, "tid": 2337800,
"ts": 6071193283483.206, "dur": 32.389,
"args": {
"External id": 167, "cbid": 41, "correlation": 949
}
},
{
"ph": "s", "id": 949, "pid": 2337800, "tid": 2337800, "ts": 6071193283483.206,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 2337800, "tid": 2337800,
"ts": 6071193283536.787, "dur": 0.671,
"args": {
"External id": 167, "cbid": 200, "correlation": 960
}
},
{
"ph": "f", "id": 960, "pid": 2337800, "tid": 2337800, "ts": 6071193283536.787,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2<cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_256x128_32x6_nt_align8>(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_256x128_32x6_nt_align8::Params)", "pid": 0, "tid": 7,
"ts": 6071193337085.767, "dur": 5816.352,
"args": {
"External id": 167, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 963, "registers per thread": 216, "shared memory": 147456, "blocks per SM": 11.909091, "warps per SM": 95.272728, "grid": [1572, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0
}
},
{
"ph": "f", "id": 963, "pid": 0, "tid": 7, "ts": 6071193337085.767,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800,
"ts": 6071193283539.782, "dur": 5.698,
"args": {
"External id": 167, "cbid": 307, "correlation": 963
}
},
{
"ph": "s", "id": 963, "pid": 2337800, "tid": 2337800, "ts": 6071193283539.782,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_red_fused_nll_loss_forward_8", "pid": 0, "tid": 7,
"ts": 6071193342904.135, "dur": 2.688,
"args": {
"External id": 168, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 975, "registers per thread": 26, "shared memory": 64, "blocks per SM": 0.030303, "warps per SM": 0.484848, "grid": [4, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 1
}
},
{
"ph": "f", "id": 975, "pid": 0, "tid": 7, "ts": 6071193342904.135,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800,
"ts": 6071193283583.748, "dur": 6.540,
"args": {
"External id": 168, "cbid": 307, "correlation": 975
}
},
{
"ph": "s", "id": 975, "pid": 2337800, "tid": 2337800, "ts": 6071193283583.748,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_per_fused_nll_loss_forward_9", "pid": 0, "tid": 7,
"ts": 6071193342907.783, "dur": 1.696,
"args": {
"External id": 169, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 980, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.015152, "grid": [1, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0
}
},
{
"ph": "f", "id": 980, "pid": 0, "tid": 7, "ts": 6071193342907.783,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800,
"ts": 6071193283614.064, "dur": 6.059,
"args": {
"External id": 169, "cbid": 307, "correlation": 980
}
},
{
"ph": "s", "id": 980, "pid": 2337800, "tid": 2337800, "ts": 6071193283614.064,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<8, at::native::FillFunctor<c10::BFloat16>, std::array<char*, 1ul> >(int, at::native::FillFunctor<c10::BFloat16>, std::array<char*, 1ul>)", "pid": 0, "tid": 7,
"ts": 6071193342910.375, "dur": 1.344,
"args": {
"External id": 173, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 991, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.030303, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0
}
},
{
"ph": "f", "id": 991, "pid": 0, "tid": 7, "ts": 6071193342910.375,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 2337800, "tid": 2337800,
"ts": 6071193283816.599, "dur": 10.216,
"args": {
"External id": 173, "cbid": 211, "correlation": 991
}
},
{
"ph": "s", "id": 991, "pid": 2337800, "tid": 2337800, "ts": 6071193283816.599,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 2337800, "tid": 2337800,
"ts": 6071193283878.753, "dur": 2.854,
"args": {
"External id": 134, "cbid": 135, "correlation": 999
}
},
{
"ph": "f", "id": 999, "pid": 2337800, "tid": 2337800, "ts": 6071193283878.753,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_0", "pid": 0, "tid": 7,
"ts": 6071193342912.583, "dur": 61.152,
"args": {
"External id": 584, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1008, "registers per thread": 16, "shared memory": 0, "blocks per SM": 285.553040, "warps per SM": 1142.212158, "grid": [37693, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100
}
},
{
"ph": "f", "id": 1008, "pid": 0, "tid": 7, "ts": 6071193342912.583,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2340515,
"ts": 6071193284369.032, "dur": 38.278,
"args": {
"External id": 584, "cbid": 307, "correlation": 1008
}
},
{
"ph": "s", "id": 1008, "pid": 2337800, "tid": 2340515, "ts": 6071193284369.032,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_1", "pid": 0, "tid": 7,
"ts": 6071193342975.655, "dur": 2.240,
"args": {
"External id": 585, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1012, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.492424, "warps per SM": 5.969697, "grid": [197, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9
}
},
{
"ph": "f", "id": 1012, "pid": 0, "tid": 7, "ts": 6071193342975.655,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2340515,
"ts": 6071193284431.006, "dur": 6.690,
"args": {
"External id": 585, "cbid": 307, "correlation": 1012
}
},
{
"ph": "s", "id": 1012, "pid": 2337800, "tid": 2340515, "ts": 6071193284431.006,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_mul_2", "pid": 0, "tid": 7,
"ts": 6071193342978.791, "dur": 47.776,
"args": {
"External id": 586, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1016, "registers per thread": 22, "shared memory": 0, "blocks per SM": 186.181824, "warps per SM": 744.727295, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100
}
},
{
"ph": "f", "id": 1016, "pid": 0, "tid": 7, "ts": 6071193342978.791,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2340515,
"ts": 6071193284473.079, "dur": 6.610,
"args": {
"External id": 586, "cbid": 307, "correlation": 1016
}
},
{
"ph": "s", "id": 1016, "pid": 2337800, "tid": 2340515, "ts": 6071193284473.079,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 2337800, "tid": 2340515,
"ts": 6071193284525.608, "dur": 2.574,
"args": {
"External id": 582, "cbid": 135, "correlation": 1021
}
},
{
"ph": "f", "id": 1021, "pid": 2337800, "tid": 2340515, "ts": 6071193284525.608,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 2337800, "tid": 2340515,
"ts": 6071193284532.348, "dur": 0.651,
"args": {
"External id": 582, "cbid": 135, "correlation": 1026
}
},
{
"ph": "f", "id": 1026, "pid": 2337800, "tid": 2340515, "ts": 6071193284532.348,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 2337800, "tid": 2340515,
"ts": 6071193284536.034, "dur": 0.501,
"args": {
"External id": 582, "cbid": 135, "correlation": 1031
}
},
{
"ph": "f", "id": 1031, "pid": 2337800, "tid": 2340515, "ts": 6071193284536.034,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7,
"ts": 6071193343027.431, "dur": 55.616,
"args": {
"External id": 599, "device": 0, "context": 1, "stream": 7, "correlation": 1065, "bytes": 50331648, "memory bandwidth (GB/s)": 904.9850402761795
}
},
{
"ph": "f", "id": 1065, "pid": 0, "tid": 7, "ts": 6071193343027.431,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 2337800, "tid": 2340515,
"ts": 6071193284673.812, "dur": 20.631,
"args": {
"External id": 599, "cbid": 41, "correlation": 1065
}
},
{
"ph": "s", "id": 1065, "pid": 2337800, "tid": 2340515, "ts": 6071193284673.812,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 0, "tid": 7,
"ts": 6071193343085.287, "dur": 20094.557,
"args": {
"External id": 604, "device": 0, "context": 1, "stream": 7, "correlation": 1071, "bytes": 50331648, "memory bandwidth (GB/s)": 2.5047403632734975
}
},
{
"ph": "f", "id": 1071, "pid": 0, "tid": 7, "ts": 6071193343085.287,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 2337800, "tid": 2340515,
"ts": 6071193284754.183, "dur": 79179.284,
"args": {
"External id": 604, "cbid": 41, "correlation": 1071
}
},
{
"ph": "s", "id": 1071, "pid": 2337800, "tid": 2340515, "ts": 6071193284754.183,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 2337800, "tid": 2340515,
"ts": 6071193363938.906, "dur": 14.341,
"args": {
"External id": 604, "cbid": 131, "correlation": 1072
}
},
{
"ph": "s", "id": 1072, "pid": 2337800, "tid": 2340515, "ts": 6071193363938.906,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_red_fused_nll_loss_backward_nll_loss_forward_0", "pid": 0, "tid": 7,
"ts": 6071193369491.492, "dur": 7.136,
"args": {
"External id": 181, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1110, "registers per thread": 32, "shared memory": 16384, "blocks per SM": 0.030303, "warps per SM": 0.484848, "grid": [4, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 1
}
},
{
"ph": "f", "id": 1110, "pid": 0, "tid": 7, "ts": 6071193369491.492,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800,
"ts": 6071193369448.282, "dur": 45.078,
"args": {
"External id": 181, "cbid": 307, "correlation": 1110
}
},
{
"ph": "s", "id": 1110, "pid": 2337800, "tid": 2337800, "ts": 6071193369448.282,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_per_fused_nll_loss_forward_1", "pid": 0, "tid": 7,
"ts": 6071193369528.836, "dur": 1.568,
"args": {
"External id": 182, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1118, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.015152, "grid": [1, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0
}
},
{
"ph": "f", "id": 1118, "pid": 0, "tid": 7, "ts": 6071193369528.836,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800,
"ts": 6071193369519.770, "dur": 8.163,
"args": {
"External id": 182, "cbid": 307, "correlation": 1118
}
},
{
"ph": "s", "id": 1118, "pid": 2337800, "tid": 2337800, "ts": 6071193369519.770,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_mul_2", "pid": 0, "tid": 7,
"ts": 6071193369564.740, "dur": 43.712,
"args": {
"External id": 183, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1126, "registers per thread": 16, "shared memory": 0, "blocks per SM": 186.181824, "warps per SM": 744.727295, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100
}
},
{
"ph": "f", "id": 1126, "pid": 0, "tid": 7, "ts": 6071193369564.740,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800,
"ts": 6071193369556.275, "dur": 7.532,
"args": {
"External id": 183, "cbid": 307, "correlation": 1126
}
},
{
"ph": "s", "id": 1126, "pid": 2337800, "tid": 2337800, "ts": 6071193369556.275,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_3", "pid": 0, "tid": 7,
"ts": 6071193369710.852, "dur": 1.344,
"args": {
"External id": 204, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1133, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.007576, "grid": [1, 1, 1], "block": [32, 1, 1], "est. achieved occupancy %": 0
}
},
{
"ph": "f", "id": 1133, "pid": 0, "tid": 7, "ts": 6071193369710.852,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800,
"ts": 6071193369700.753, "dur": 8.763,
"args": {
"External id": 204, "cbid": 307, "correlation": 1133
}
},
{
"ph": "s", "id": 1133, "pid": 2337800, "tid": 2337800, "ts": 6071193369700.753,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_4", "pid": 0, "tid": 7,
"ts": 6071193369738.820, "dur": 32.160,
"args": {
"External id": 205, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1140, "registers per thread": 16, "shared memory": 0, "blocks per SM": 285.553040, "warps per SM": 1142.212158, "grid": [37693, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100
}
},
{
"ph": "f", "id": 1140, "pid": 0, "tid": 7, "ts": 6071193369738.820,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800,
"ts": 6071193369731.249, "dur": 6.159,
"args": {
"External id": 205, "cbid": 307, "correlation": 1140
}
},
{
"ph": "s", "id": 1140, "pid": 2337800, "tid": 2337800, "ts": 6071193369731.249,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_5", "pid": 0, "tid": 7,
"ts": 6071193369771.844, "dur": 1.408,
"args": {
"External id": 206, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1147, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.750000, "warps per SM": 3.000000, "grid": [99, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 5
}
},
{
"ph": "f", "id": 1147, "pid": 0, "tid": 7, "ts": 6071193369771.844,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800,
"ts": 6071193369757.729, "dur": 6.900,
"args": {
"External id": 206, "cbid": 307, "correlation": 1147
}
},
{
"ph": "s", "id": 1147, "pid": 2337800, "tid": 2337800, "ts": 6071193369757.729,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 2337800, "tid": 2337800,
"ts": 6071193369879.172, "dur": 2.454,
"args": {
"External id": 207, "cbid": 200, "correlation": 1162
}
},
{
"ph": "f", "id": 1162, "pid": 2337800, "tid": 2337800, "ts": 6071193369879.172,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2<cutlass_75_tensorop_bf16_s1688gemm_bf16_128x128_tn_align1>(cutlass_75_tensorop_bf16_s1688gemm_bf16_128x128_tn_align1::Params)", "pid": 0, "tid": 7,
"ts": 6071193369894.628, "dur": 26815.421,
"args": {
"External id": 207, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1164, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 775.757568, "warps per SM": 3103.030273, "grid": [2048, 50, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13
}
},
{
"ph": "f", "id": 1164, "pid": 0, "tid": 7, "ts": 6071193369894.628,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800,
"ts": 6071193369884.981, "dur": 7.681,
"args": {
"External id": 207, "cbid": 307, "correlation": 1164
}
},
{
"ph": "s", "id": 1164, "pid": 2337800, "tid": 2337800, "ts": 6071193369884.981,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_red_fused__log_softmax__log_softmax_backward_data_addmm_nll_loss_backward_nll_loss_forward_6", "pid": 0, "tid": 7,
"ts": 6071193396711.009, "dur": 5565.087,
"args": {
"External id": 208, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1185, "registers per thread": 48, "shared memory": 32, "blocks per SM": 248.242432, "warps per SM": 1985.939453, "grid": [32768, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 63
}
},
{
"ph": "f", "id": 1185, "pid": 0, "tid": 7, "ts": 6071193396711.009,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800,
"ts": 6071193369934.876, "dur": 8.223,
"args": {
"External id": 208, "cbid": 307, "correlation": 1185
}
},
{
"ph": "s", "id": 1185, "pid": 2337800, "tid": 2337800, "ts": 6071193369934.876,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 2337800, "tid": 2337800,
"ts": 6071193369966.644, "dur": 0.421,
"args": {
"External id": 209, "cbid": 200, "correlation": 1198
}
},
{
"ph": "f", "id": 1198, "pid": 2337800, "tid": 2337800, "ts": 6071193369966.644,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2<cutlass_75_tensorop_bf16_s1688gemm_bf16_256x128_nn_align1>(cutlass_75_tensorop_bf16_s1688gemm_bf16_256x128_nn_align1::Params)", "pid": 0, "tid": 7,
"ts": 6071193402277.056, "dur": 19591.422,
"args": {
"External id": 209, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1201, "registers per thread": 229, "shared memory": 49152, "blocks per SM": 7.757576, "warps per SM": 62.060608, "grid": [1024, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13
}
},
{
"ph": "f", "id": 1201, "pid": 0, "tid": 7, "ts": 6071193402277.056,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800,
"ts": 6071193369971.001, "dur": 5.558,
"args": {
"External id": 209, "cbid": 307, "correlation": 1201
}
},
{
"ph": "s", "id": 1201, "pid": 2337800, "tid": 2337800, "ts": 6071193369971.001,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_red_fused_nll_loss_forward_sum_7", "pid": 0, "tid": 7,
"ts": 6071193421869.374, "dur": 1662.208,
"args": {
"External id": 210, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1212, "registers per thread": 40, "shared memory": 4096, "blocks per SM": 5.954545, "warps per SM": 95.272728, "grid": [786, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 75
}
},
{
"ph": "f", "id": 1212, "pid": 0, "tid": 7, "ts": 6071193421869.374,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800,
"ts": 6071193370004.912, "dur": 6.970,
"args": {
"External id": 210, "cbid": 307, "correlation": 1212
}
},
{
"ph": "s", "id": 1212, "pid": 2337800, "tid": 2337800, "ts": 6071193370004.912,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7,
"ts": 6071193423532.445, "dur": 75.137,
"args": {
"External id": 211, "device": 0, "context": 1, "stream": 7, "correlation": 1219, "bytes": 77194752, "memory bandwidth (GB/s)": 1027.3866670215739
}
},
{
"ph": "f", "id": 1219, "pid": 0, "tid": 7, "ts": 6071193423532.445,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 2337800, "tid": 2337800,
"ts": 6071193370041.126, "dur": 29.795,
"args": {
"External id": 211, "cbid": 41, "correlation": 1219
}
},
{
"ph": "s", "id": 1219, "pid": 2337800, "tid": 2337800, "ts": 6071193370041.126,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 2337800, "tid": 2337800,
"ts": 6071193370091.743, "dur": 0.570,
"args": {
"External id": 211, "cbid": 200, "correlation": 1230
}
},
{
"ph": "f", "id": 1230, "pid": 2337800, "tid": 2337800, "ts": 6071193370091.743,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2<cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_256x128_32x6_nt_align8>(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_256x128_32x6_nt_align8::Params)", "pid": 0, "tid": 7,
"ts": 6071193423608.414, "dur": 5815.999,
"args": {
"External id": 211, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1233, "registers per thread": 216, "shared memory": 147456, "blocks per SM": 11.909091, "warps per SM": 95.272728, "grid": [1572, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0
}
},
{
"ph": "f", "id": 1233, "pid": 0, "tid": 7, "ts": 6071193423608.414,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800,
"ts": 6071193370094.617, "dur": 5.909,
"args": {
"External id": 211, "cbid": 307, "correlation": 1233
}
},
{
"ph": "s", "id": 1233, "pid": 2337800, "tid": 2337800, "ts": 6071193370094.617,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_red_fused_nll_loss_forward_8", "pid": 0, "tid": 7,
"ts": 6071193429426.333, "dur": 2.848,
"args": {
"External id": 212, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1245, "registers per thread": 26, "shared memory": 64, "blocks per SM": 0.030303, "warps per SM": 0.484848, "grid": [4, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 1
}
},
{
"ph": "f", "id": 1245, "pid": 0, "tid": 7, "ts": 6071193429426.333,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800,
"ts": 6071193370137.542, "dur": 7.130,
"args": {
"External id": 212, "cbid": 307, "correlation": 1245
}
},
{
"ph": "s", "id": 1245, "pid": 2337800, "tid": 2337800, "ts": 6071193370137.542,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_per_fused_nll_loss_forward_9", "pid": 0, "tid": 7,
"ts": 6071193429430.173, "dur": 1.696,
"args": {
"External id": 213, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1250, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.015152, "grid": [1, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0
}
},
{
"ph": "f", "id": 1250, "pid": 0, "tid": 7, "ts": 6071193429430.173,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2337800,
"ts": 6071193370168.789, "dur": 6.550,
"args": {
"External id": 213, "cbid": 307, "correlation": 1250
}
},
{
"ph": "s", "id": 1250, "pid": 2337800, "tid": 2337800, "ts": 6071193370168.789,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<8, at::native::FillFunctor<c10::BFloat16>, std::array<char*, 1ul> >(int, at::native::FillFunctor<c10::BFloat16>, std::array<char*, 1ul>)", "pid": 0, "tid": 7,
"ts": 6071193429432.765, "dur": 1.344,
"args": {
"External id": 217, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1261, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.030303, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0
}
},
{
"ph": "f", "id": 1261, "pid": 0, "tid": 7, "ts": 6071193429432.765,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 2337800, "tid": 2337800,
"ts": 6071193370377.203, "dur": 10.005,
"args": {
"External id": 217, "cbid": 211, "correlation": 1261
}
},
{
"ph": "s", "id": 1261, "pid": 2337800, "tid": 2337800, "ts": 6071193370377.203,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 2337800, "tid": 2337800,
"ts": 6071193370425.235, "dur": 2.865,
"args": {
"External id": 178, "cbid": 135, "correlation": 1269
}
},
{
"ph": "f", "id": 1269, "pid": 2337800, "tid": 2337800, "ts": 6071193370425.235,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_0", "pid": 0, "tid": 7,
"ts": 6071193429435.101, "dur": 61.184,
"args": {
"External id": 607, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1278, "registers per thread": 16, "shared memory": 0, "blocks per SM": 285.553040, "warps per SM": 1142.212158, "grid": [37693, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100
}
},
{
"ph": "f", "id": 1278, "pid": 0, "tid": 7, "ts": 6071193429435.101,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2340515,
"ts": 6071193370876.065, "dur": 33.871,
"args": {
"External id": 607, "cbid": 307, "correlation": 1278
}
},
{
"ph": "s", "id": 1278, "pid": 2337800, "tid": 2340515, "ts": 6071193370876.065,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_1", "pid": 0, "tid": 7,
"ts": 6071193429497.181, "dur": 2.336,
"args": {
"External id": 608, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1282, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.492424, "warps per SM": 5.969697, "grid": [197, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9
}
},
{
"ph": "f", "id": 1282, "pid": 0, "tid": 7, "ts": 6071193429497.181,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2340515,
"ts": 6071193370932.911, "dur": 5.939,
"args": {
"External id": 608, "cbid": 307, "correlation": 1282
}
},
{
"ph": "s", "id": 1282, "pid": 2337800, "tid": 2340515, "ts": 6071193370932.911,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_mul_2", "pid": 0, "tid": 7,
"ts": 6071193429500.477, "dur": 48.352,
"args": {
"External id": 609, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1286, "registers per thread": 22, "shared memory": 0, "blocks per SM": 186.181824, "warps per SM": 744.727295, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100
}
},
{
"ph": "f", "id": 1286, "pid": 0, "tid": 7, "ts": 6071193429500.477,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2337800, "tid": 2340515,
"ts": 6071193370972.390, "dur": 5.138,
"args": {
"External id": 609, "cbid": 307, "correlation": 1286
}
},
{
"ph": "s", "id": 1286, "pid": 2337800, "tid": 2340515, "ts": 6071193370972.390,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 2337800, "tid": 2340515,
"ts": 6071193371020.253, "dur": 2.604,
"args": {
"External id": 605, "cbid": 135, "correlation": 1291
}
},
{
"ph": "f", "id": 1291, "pid": 2337800, "tid": 2340515, "ts": 6071193371020.253,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 2337800, "tid": 2340515,
"ts": 6071193371026.472, "dur": 0.501,
"args": {
"External id": 605, "cbid": 135, "correlation": 1296
}
},
{
"ph": "f", "id": 1296, "pid": 2337800, "tid": 2340515, "ts": 6071193371026.472,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 2337800, "tid": 2340515,
"ts": 6071193371029.236, "dur": 0.441,
"args": {
"External id": 605, "cbid": 135, "correlation": 1301
}
},
{
"ph": "f", "id": 1301, "pid": 2337800, "tid": 2340515, "ts": 6071193371029.236,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7,
"ts": 6071193429549.757, "dur": 55.424,
"args": {
"External id": 622, "device": 0, "context": 1, "stream": 7, "correlation": 1335, "bytes": 50331648, "memory bandwidth (GB/s)": 908.1200923787529
}
},
{
"ph": "f", "id": 1335, "pid": 0, "tid": 7, "ts": 6071193429549.757,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 2337800, "tid": 2340515,
"ts": 6071193371144.450, "dur": 16.475,
"args": {
"External id": 622, "cbid": 41, "correlation": 1335
}
},
{
"ph": "s", "id": 1335, "pid": 2337800, "tid": 2340515, "ts": 6071193371144.450,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pageable)", "pid": 0, "tid": 7,
"ts": 6071193429607.261, "dur": 21396.606,
"args": {
"External id": 627, "device": 0, "context": 1, "stream": 7, "correlation": 1341, "bytes": 50331648, "memory bandwidth (GB/s)": 2.352319241659168
}
},
{
"ph": "f", "id": 1341, "pid": 0, "tid": 7, "ts": 6071193429607.261,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 2337800, "tid": 2340515,
"ts": 6071193371214.405, "dur": 80904.877,
"args": {
"External id": 627, "cbid": 41, "correlation": 1341
}
},
{
"ph": "s", "id": 1341, "pid": 2337800, "tid": 2340515, "ts": 6071193371214.405,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 2337800, "tid": 2340515,
"ts": 6071193452123.899, "dur": 14.261,
"args": {
"External id": 627, "cbid": 131, "correlation": 1342
}
},
{
"ph": "s", "id": 1342, "pid": 2337800, "tid": 2340515, "ts": 6071193452123.899,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceSynchronize", "pid": 2337800, "tid": 2337800,
"ts": 6071193456547.529, "dur": 31.217,
"args": {
"cbid": 165, "correlation": 1353
}
},
{
"ph": "s", "id": 1353, "pid": 2337800, "tid": 2337800, "ts": 6071193456547.529,
"cat": "ac2g", "name": "ac2g"
},
{
"name": "process_name", "ph": "M", "ts": 6071193017422.991, "pid": 2337800, "tid": 0,
"args": {
"name": "python"
}
},
{
"name": "process_labels", "ph": "M", "ts": 6071193017422.991, "pid": 2337800, "tid": 0,
"args": {
"labels": "CPU"
}
},
{
"name": "process_sort_index", "ph": "M", "ts": 6071193017422.991, "pid": 2337800, "tid": 0,
"args": {
"sort_index": 2337800
}
},
{
"name": "process_name", "ph": "M", "ts": 6071193017422.991, "pid": 0, "tid": 0,
"args": {
"name": "python"
}
},
{
"name": "process_labels", "ph": "M", "ts": 6071193017422.991, "pid": 0, "tid": 0,
"args": {
"labels": "GPU 0"
}
},
{
"name": "process_sort_index", "ph": "M", "ts": 6071193017422.991, "pid": 0, "tid": 0,
"args": {
"sort_index": 5000000
}
},
{
"name": "process_name", "ph": "M", "ts": 6071193017422.991, "pid": 1, "tid": 0,
"args": {
"name": "python"
}
},
{
"name": "process_labels", "ph": "M", "ts": 6071193017422.991, "pid": 1, "tid": 0,
"args": {
"labels": "GPU 1"
}
},
{
"name": "process_sort_index", "ph": "M", "ts": 6071193017422.991, "pid": 1, "tid": 0,
"args": {
"sort_index": 5000001
}
},
{
"name": "process_name", "ph": "M", "ts": 6071193017422.991, "pid": 2, "tid": 0,
"args": {
"name": "python"
}
},
{
"name": "process_labels", "ph": "M", "ts": 6071193017422.991, "pid": 2, "tid": 0,
"args": {
"labels": "GPU 2"
}
},
{
"name": "process_sort_index", "ph": "M", "ts": 6071193017422.991, "pid": 2, "tid": 0,
"args": {
"sort_index": 5000002
}
},
{
"name": "process_name", "ph": "M", "ts": 6071193017422.991, "pid": 3, "tid": 0,
"args": {
"name": "python"
}
},
{
"name": "process_labels", "ph": "M", "ts": 6071193017422.991, "pid": 3, "tid": 0,
"args": {
"labels": "GPU 3"
}
},
{
"name": "process_sort_index", "ph": "M", "ts": 6071193017422.991, "pid": 3, "tid": 0,
"args": {
"sort_index": 5000003
}
},
{
"name": "process_name", "ph": "M", "ts": 6071193017422.991, "pid": 4, "tid": 0,
"args": {
"name": "python"
}
},
{
"name": "process_labels", "ph": "M", "ts": 6071193017422.991, "pid": 4, "tid": 0,
"args": {
"labels": "GPU 4"
}
},
{
"name": "process_sort_index", "ph": "M", "ts": 6071193017422.991, "pid": 4, "tid": 0,
"args": {
"sort_index": 5000004
}
},
{
"name": "process_name", "ph": "M", "ts": 6071193017422.991, "pid": 5, "tid": 0,
"args": {
"name": "python"
}
},
{
"name": "process_labels", "ph": "M", "ts": 6071193017422.991, "pid": 5, "tid": 0,
"args": {
"labels": "GPU 5"
}
},
{
"name": "process_sort_index", "ph": "M", "ts": 6071193017422.991, "pid": 5, "tid": 0,
"args": {
"sort_index": 5000005
}
},
{
"name": "process_name", "ph": "M", "ts": 6071193017422.991, "pid": 6, "tid": 0,
"args": {
"name": "python"
}
},
{
"name": "process_labels", "ph": "M", "ts": 6071193017422.991, "pid": 6, "tid": 0,
"args": {
"labels": "GPU 6"
}
},
{
"name": "process_sort_index", "ph": "M", "ts": 6071193017422.991, "pid": 6, "tid": 0,
"args": {
"sort_index": 5000006
}
},
{
"name": "process_name", "ph": "M", "ts": 6071193017422.991, "pid": 7, "tid": 0,
"args": {
"name": "python"
}
},
{
"name": "process_labels", "ph": "M", "ts": 6071193017422.991, "pid": 7, "tid": 0,
"args": {
"labels": "GPU 7"
}
},
{
"name": "process_sort_index", "ph": "M", "ts": 6071193017422.991, "pid": 7, "tid": 0,
"args": {
"sort_index": 5000007
}
},
{
"name": "process_name", "ph": "M", "ts": 6071193017422.991, "pid": 8, "tid": 0,
"args": {
"name": "python"
}
},
{
"name": "process_labels", "ph": "M", "ts": 6071193017422.991, "pid": 8, "tid": 0,
"args": {
"labels": "GPU 8"
}
},
{
"name": "process_sort_index", "ph": "M", "ts": 6071193017422.991, "pid": 8, "tid": 0,
"args": {
"sort_index": 5000008
}
},
{
"name": "process_name", "ph": "M", "ts": 6071193017422.991, "pid": 9, "tid": 0,
"args": {
"name": "python"
}
},
{
"name": "process_labels", "ph": "M", "ts": 6071193017422.991, "pid": 9, "tid": 0,
"args": {
"labels": "GPU 9"
}
},
{
"name": "process_sort_index", "ph": "M", "ts": 6071193017422.991, "pid": 9, "tid": 0,
"args": {
"sort_index": 5000009
}
},
{
"name": "process_name", "ph": "M", "ts": 6071193017422.991, "pid": 10, "tid": 0,
"args": {
"name": "python"
}
},
{
"name": "process_labels", "ph": "M", "ts": 6071193017422.991, "pid": 10, "tid": 0,
"args": {
"labels": "GPU 10"
}
},
{
"name": "process_sort_index", "ph": "M", "ts": 6071193017422.991, "pid": 10, "tid": 0,
"args": {
"sort_index": 5000010
}
},
{
"name": "process_name", "ph": "M", "ts": 6071193017422.991, "pid": 11, "tid": 0,
"args": {
"name": "python"
}
},
{
"name": "process_labels", "ph": "M", "ts": 6071193017422.991, "pid": 11, "tid": 0,
"args": {
"labels": "GPU 11"
}
},
{
"name": "process_sort_index", "ph": "M", "ts": 6071193017422.991, "pid": 11, "tid": 0,
"args": {
"sort_index": 5000011
}
},
{
"name": "process_name", "ph": "M", "ts": 6071193017422.991, "pid": 12, "tid": 0,
"args": {
"name": "python"
}
},
{
"name": "process_labels", "ph": "M", "ts": 6071193017422.991, "pid": 12, "tid": 0,
"args": {
"labels": "GPU 12"
}
},
{
"name": "process_sort_index", "ph": "M", "ts": 6071193017422.991, "pid": 12, "tid": 0,
"args": {
"sort_index": 5000012
}
},
{
"name": "process_name", "ph": "M", "ts": 6071193017422.991, "pid": 13, "tid": 0,
"args": {
"name": "python"
}
},
{
"name": "process_labels", "ph": "M", "ts": 6071193017422.991, "pid": 13, "tid": 0,
"args": {
"labels": "GPU 13"
}
},
{
"name": "process_sort_index", "ph": "M", "ts": 6071193017422.991, "pid": 13, "tid": 0,
"args": {
"sort_index": 5000013
}
},
{
"name": "process_name", "ph": "M", "ts": 6071193017422.991, "pid": 14, "tid": 0,
"args": {
"name": "python"
}
},
{
"name": "process_labels", "ph": "M", "ts": 6071193017422.991, "pid": 14, "tid": 0,
"args": {
"labels": "GPU 14"
}
},
{
"name": "process_sort_index", "ph": "M", "ts": 6071193017422.991, "pid": 14, "tid": 0,
"args": {
"sort_index": 5000014
}
},
{
"name": "process_name", "ph": "M", "ts": 6071193017422.991, "pid": 15, "tid": 0,
"args": {
"name": "python"
}
},
{
"name": "process_labels", "ph": "M", "ts": 6071193017422.991, "pid": 15, "tid": 0,
"args": {
"labels": "GPU 15"
}
},
{
"name": "process_sort_index", "ph": "M", "ts": 6071193017422.991, "pid": 15, "tid": 0,
"args": {
"sort_index": 5000015
}
},
{
"name": "thread_name", "ph": "M", "ts": 6071193017422.991, "pid": 0, "tid": 7,
"args": {
"name": "stream 7 "
}
},
{
"name": "thread_sort_index", "ph": "M", "ts": 6071193017422.991, "pid": 0, "tid": 7,
"args": {
"sort_index": 7
}
},
{
"name": "thread_name", "ph": "M", "ts": 6071193017422.991, "pid": 2337800, "tid": 2340515,
"args": {
"name": "thread 2340515 (pt_autograd_0)"
}
},
{
"name": "thread_sort_index", "ph": "M", "ts": 6071193017422.991, "pid": 2337800, "tid": 2340515,
"args": {
"sort_index": 2340515
}
},
{
"name": "thread_name", "ph": "M", "ts": 6071193017422.991, "pid": 2337800, "tid": 2340515,
"args": {
"name": "thread 2340515 (python)"
}
},
{
"name": "thread_sort_index", "ph": "M", "ts": 6071193017422.991, "pid": 2337800, "tid": 2340515,
"args": {
"sort_index": 2340515
}
},
{
"name": "thread_name", "ph": "M", "ts": 6071193017422.991, "pid": 2337800, "tid": 2337800,
"args": {
"name": "thread 2337800 (python)"
}
},
{
"name": "thread_sort_index", "ph": "M", "ts": 6071193017422.991, "pid": 2337800, "tid": 2337800,
"args": {
"sort_index": 2337800
}
},
{
"ph": "X", "cat": "Trace", "ts": 6071193017369.210, "dur": 439222.959,
"pid": "Spans", "tid": "PyTorch Profiler",
"name": "PyTorch Profiler (0)",
"args": {
"Op count": 0
}
},
{
"name": "process_sort_index", "ph": "M", "ts": 6071193017369.210,
"pid": "Spans", "tid": 0,
"args": {
"sort_index": 536870912
}
},
{
"name": "Iteration Start: PyTorch Profiler", "ph": "i", "s": "g",
"pid": "Traces", "tid": "Trace PyTorch Profiler", "ts": 6071193017369.210
},
{
"name": "Record Window End", "ph": "i", "s": "g",
"pid": "", "tid": "", "ts": 6071193457002.819
}
],
"traceName": "/tmp/trace.json"
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment