Skip to content

Instantly share code, notes, and snippets.

@shunting314
Created June 11, 2025 22:00
Show Gist options
  • Save shunting314/6e4572c8c0592403e55424ad1535ae3d to your computer and use it in GitHub Desktop.
Save shunting314/6e4572c8c0592403e55424ad1535ae3d to your computer and use it in GitHub Desktop.
{
"schemaVersion": 1,
"deviceProperties": [
{
"id": 0, "name": "NVIDIA H100", "totalGlobalMem": 102010781696,
"computeMajor": 9, "computeMinor": 0,
"maxThreadsPerBlock": 1024, "maxThreadsPerMultiprocessor": 2048,
"regsPerBlock": 65536, "warpSize": 32,
"sharedMemPerBlock": 49152, "numSms": 132
, "regsPerMultiprocessor": 65536, "sharedMemPerBlockOptin": 232448, "sharedMemPerMultiprocessor": 233472
}
],
"cupti_version": 24,
"cuda_runtime_version": 12060,
"cuda_driver_version": 12020,
"trace_id": "9C4A4A89A5964FBFA96ADF67B310C25E",
"displayTimeUnit": "ms",
"baseTimeNanoseconds": 1743521598000000000,
"traceEvents": [
{
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 2537909, "tid": 2544200,
"ts": 6157602137737.466, "dur": 604.001,
"args": {
"External id": 513,"Record function id": 0, "Sequence number": 130, "Fwd thread id": 1, "Ev Idx": 0
}
},
{
"ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 2537909, "tid": 2544200,
"ts": 6157602137758.027, "dur": 533.565,
"args": {
"External id": 514,"Record function id": 0, "Sequence number": 130, "Fwd thread id": 1, "Ev Idx": 1
}
},
{
"ph": "f", "id": 1, "pid": 2537909, "tid": 2544200, "ts": 6157602137758.027,
"cat": "fwdbwd", "name": "fwdbwd", "bp": "e"
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_0", "pid": 2537909, "tid": 2544200,
"ts": 6157602138102.076, "dur": 84.077,
"args": {
"External id": 515,"Record function id": 0, "Ev Idx": 2
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_1", "pid": 2537909, "tid": 2544200,
"ts": 6157602138203.058, "dur": 13.671,
"args": {
"External id": 516,"Record function id": 0, "Ev Idx": 3
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_mul_2", "pid": 2537909, "tid": 2544200,
"ts": 6157602138249.288, "dur": 11.087,
"args": {
"External id": 517,"Record function id": 0, "Ev Idx": 4
}
},
{
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 2537909, "tid": 2544200,
"ts": 6157602138364.051, "dur": 31.888,
"args": {
"External id": 518,"Record function id": 0, "Ev Idx": 5
}
},
{
"ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 2537909, "tid": 2544200,
"ts": 6157602138367.096, "dur": 26.309,
"args": {
"External id": 519,"Record function id": 0, "Ev Idx": 6
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 2537909, "tid": 2544200,
"ts": 6157602138378.573, "dur": 13.070,
"args": {
"External id": 520,"Record function id": 0, "Ev Idx": 7
}
},
{
"ph": "X", "cat": "cpu_op", "name": "detach", "pid": 2537909, "tid": 2544200,
"ts": 6157602138381.327, "dur": 10.176,
"args": {
"External id": 521,"Record function id": 0, "Ev Idx": 8
}
},
{
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 2537909, "tid": 2544200,
"ts": 6157602138400.827, "dur": 5.268,
"args": {
"External id": 522,"Record function id": 0, "Ev Idx": 9
}
},
{
"ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 2537909, "tid": 2544200,
"ts": 6157602138402.549, "dur": 2.744,
"args": {
"External id": 523,"Record function id": 0, "Ev Idx": 10
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 2537909, "tid": 2544200,
"ts": 6157602138403.250, "dur": 1.092,
"args": {
"External id": 524,"Record function id": 0, "Ev Idx": 11
}
},
{
"ph": "X", "cat": "cpu_op", "name": "detach", "pid": 2537909, "tid": 2544200,
"ts": 6157602138403.521, "dur": 0.691,
"args": {
"External id": 525,"Record function id": 0, "Ev Idx": 12
}
},
{
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 2537909, "tid": 2544200,
"ts": 6157602138410.351, "dur": 4.296,
"args": {
"External id": 526,"Record function id": 0, "Ev Idx": 13
}
},
{
"ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 2537909, "tid": 2544200,
"ts": 6157602138411.663, "dur": 2.143,
"args": {
"External id": 527,"Record function id": 0, "Ev Idx": 14
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 2537909, "tid": 2544200,
"ts": 6157602138412.504, "dur": 0.841,
"args": {
"External id": 528,"Record function id": 0, "Ev Idx": 15
}
},
{
"ph": "X", "cat": "cpu_op", "name": "detach", "pid": 2537909, "tid": 2544200,
"ts": 6157602138412.694, "dur": 0.551,
"args": {
"External id": 529,"Record function id": 0, "Ev Idx": 16
}
},
{
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 2537909, "tid": 2544200,
"ts": 6157602139216.717, "dur": 187.873,
"args": {
"External id": 530,"Record function id": 0, "Sequence number": 131, "Fwd thread id": 1, "Ev Idx": 17
}
},
{
"ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 2537909, "tid": 2544200,
"ts": 6157602139219.601, "dur": 169.786,
"args": {
"External id": 531,"Record function id": 0, "Sequence number": 131, "Fwd thread id": 1, "Ev Idx": 18
}
},
{
"ph": "f", "id": 2, "pid": 2537909, "tid": 2544200, "ts": 6157602139219.601,
"cat": "fwdbwd", "name": "fwdbwd", "bp": "e"
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_0", "pid": 2537909, "tid": 2544200,
"ts": 6157602139321.485, "dur": 21.272,
"args": {
"External id": 532,"Record function id": 0, "Ev Idx": 19
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_1", "pid": 2537909, "tid": 2544200,
"ts": 6157602139349.838, "dur": 8.823,
"args": {
"External id": 533,"Record function id": 0, "Ev Idx": 20
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_mul_2", "pid": 2537909, "tid": 2544200,
"ts": 6157602139367.995, "dur": 8.483,
"args": {
"External id": 534,"Record function id": 0, "Ev Idx": 21
}
},
{
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 2537909, "tid": 2544200,
"ts": 6157602139413.864, "dur": 8.603,
"args": {
"External id": 535,"Record function id": 0, "Ev Idx": 22
}
},
{
"ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 2537909, "tid": 2544200,
"ts": 6157602139415.507, "dur": 5.819,
"args": {
"External id": 536,"Record function id": 0, "Ev Idx": 23
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 2537909, "tid": 2544200,
"ts": 6157602139417.340, "dur": 3.164,
"args": {
"External id": 537,"Record function id": 0, "Ev Idx": 24
}
},
{
"ph": "X", "cat": "cpu_op", "name": "detach", "pid": 2537909, "tid": 2544200,
"ts": 6157602139417.880, "dur": 2.504,
"args": {
"External id": 538,"Record function id": 0, "Ev Idx": 25
}
},
{
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 2537909, "tid": 2544200,
"ts": 6157602139426.694, "dur": 3.816,
"args": {
"External id": 539,"Record function id": 0, "Ev Idx": 26
}
},
{
"ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 2537909, "tid": 2544200,
"ts": 6157602139427.805, "dur": 2.003,
"args": {
"External id": 540,"Record function id": 0, "Ev Idx": 27
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 2537909, "tid": 2544200,
"ts": 6157602139428.356, "dur": 1.122,
"args": {
"External id": 541,"Record function id": 0, "Ev Idx": 28
}
},
{
"ph": "X", "cat": "cpu_op", "name": "detach", "pid": 2537909, "tid": 2544200,
"ts": 6157602139428.547, "dur": 0.831,
"args": {
"External id": 542,"Record function id": 0, "Ev Idx": 29
}
},
{
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 2537909, "tid": 2544200,
"ts": 6157602139434.345, "dur": 3.716,
"args": {
"External id": 543,"Record function id": 0, "Ev Idx": 30
}
},
{
"ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 2537909, "tid": 2544200,
"ts": 6157602139435.467, "dur": 1.953,
"args": {
"External id": 544,"Record function id": 0, "Ev Idx": 31
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 2537909, "tid": 2544200,
"ts": 6157602139436.218, "dur": 0.901,
"args": {
"External id": 545,"Record function id": 0, "Ev Idx": 32
}
},
{
"ph": "X", "cat": "cpu_op", "name": "detach", "pid": 2537909, "tid": 2544200,
"ts": 6157602139436.398, "dur": 0.621,
"args": {
"External id": 546,"Record function id": 0, "Ev Idx": 33
}
},
{
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 2537909, "tid": 2544200,
"ts": 6157602140139.529, "dur": 182.335,
"args": {
"External id": 547,"Record function id": 0, "Sequence number": 132, "Fwd thread id": 1, "Ev Idx": 34
}
},
{
"ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 2537909, "tid": 2544200,
"ts": 6157602140141.973, "dur": 148.083,
"args": {
"External id": 548,"Record function id": 0, "Sequence number": 132, "Fwd thread id": 1, "Ev Idx": 35
}
},
{
"ph": "f", "id": 3, "pid": 2537909, "tid": 2544200, "ts": 6157602140141.973,
"cat": "fwdbwd", "name": "fwdbwd", "bp": "e"
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_0", "pid": 2537909, "tid": 2544200,
"ts": 6157602140230.666, "dur": 19.640,
"args": {
"External id": 549,"Record function id": 0, "Ev Idx": 36
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_1", "pid": 2537909, "tid": 2544200,
"ts": 6157602140255.994, "dur": 7.392,
"args": {
"External id": 550,"Record function id": 0, "Ev Idx": 37
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_mul_2", "pid": 2537909, "tid": 2544200,
"ts": 6157602140271.858, "dur": 6.921,
"args": {
"External id": 551,"Record function id": 0, "Ev Idx": 38
}
},
{
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 2537909, "tid": 2544200,
"ts": 6157602140331.288, "dur": 7.291,
"args": {
"External id": 552,"Record function id": 0, "Ev Idx": 39
}
},
{
"ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 2537909, "tid": 2544200,
"ts": 6157602140332.860, "dur": 4.808,
"args": {
"External id": 553,"Record function id": 0, "Ev Idx": 40
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 2537909, "tid": 2544200,
"ts": 6157602140334.453, "dur": 2.333,
"args": {
"External id": 554,"Record function id": 0, "Ev Idx": 41
}
},
{
"ph": "X", "cat": "cpu_op", "name": "detach", "pid": 2537909, "tid": 2544200,
"ts": 6157602140334.974, "dur": 1.692,
"args": {
"External id": 555,"Record function id": 0, "Ev Idx": 42
}
},
{
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 2537909, "tid": 2544200,
"ts": 6157602140342.725, "dur": 3.526,
"args": {
"External id": 556,"Record function id": 0, "Ev Idx": 43
}
},
{
"ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 2537909, "tid": 2544200,
"ts": 6157602140343.877, "dur": 1.703,
"args": {
"External id": 557,"Record function id": 0, "Ev Idx": 44
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 2537909, "tid": 2544200,
"ts": 6157602140344.418, "dur": 0.851,
"args": {
"External id": 558,"Record function id": 0, "Ev Idx": 45
}
},
{
"ph": "X", "cat": "cpu_op", "name": "detach", "pid": 2537909, "tid": 2544200,
"ts": 6157602140344.608, "dur": 0.571,
"args": {
"External id": 559,"Record function id": 0, "Ev Idx": 46
}
},
{
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 2537909, "tid": 2544200,
"ts": 6157602140350.006, "dur": 3.666,
"args": {
"External id": 560,"Record function id": 0, "Ev Idx": 47
}
},
{
"ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 2537909, "tid": 2544200,
"ts": 6157602140351.098, "dur": 1.893,
"args": {
"External id": 561,"Record function id": 0, "Ev Idx": 48
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 2537909, "tid": 2544200,
"ts": 6157602140351.859, "dur": 0.821,
"args": {
"External id": 562,"Record function id": 0, "Ev Idx": 49
}
},
{
"ph": "X", "cat": "cpu_op", "name": "detach", "pid": 2537909, "tid": 2544200,
"ts": 6157602140352.049, "dur": 0.531,
"args": {
"External id": 563,"Record function id": 0, "Ev Idx": 50
}
},
{
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 2537909, "tid": 2544200,
"ts": 6157602141048.339, "dur": 165.029,
"args": {
"External id": 564,"Record function id": 0, "Sequence number": 133, "Fwd thread id": 1, "Ev Idx": 51
}
},
{
"ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 2537909, "tid": 2544200,
"ts": 6157602141050.823, "dur": 149.005,
"args": {
"External id": 565,"Record function id": 0, "Sequence number": 133, "Fwd thread id": 1, "Ev Idx": 52
}
},
{
"ph": "f", "id": 4, "pid": 2537909, "tid": 2544200, "ts": 6157602141050.823,
"cat": "fwdbwd", "name": "fwdbwd", "bp": "e"
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_0", "pid": 2537909, "tid": 2544200,
"ts": 6157602141137.604, "dur": 19.850,
"args": {
"External id": 566,"Record function id": 0, "Ev Idx": 53
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_1", "pid": 2537909, "tid": 2544200,
"ts": 6157602141163.383, "dur": 8.022,
"args": {
"External id": 567,"Record function id": 0, "Ev Idx": 54
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_mul_2", "pid": 2537909, "tid": 2544200,
"ts": 6157602141180.108, "dur": 7.391,
"args": {
"External id": 568,"Record function id": 0, "Ev Idx": 55
}
},
{
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 2537909, "tid": 2544200,
"ts": 6157602141222.252, "dur": 6.910,
"args": {
"External id": 569,"Record function id": 0, "Ev Idx": 56
}
},
{
"ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 2537909, "tid": 2544200,
"ts": 6157602141223.664, "dur": 4.547,
"args": {
"External id": 570,"Record function id": 0, "Ev Idx": 57
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 2537909, "tid": 2544200,
"ts": 6157602141225.056, "dur": 2.604,
"args": {
"External id": 571,"Record function id": 0, "Ev Idx": 58
}
},
{
"ph": "X", "cat": "cpu_op", "name": "detach", "pid": 2537909, "tid": 2544200,
"ts": 6157602141225.517, "dur": 2.023,
"args": {
"External id": 572,"Record function id": 0, "Ev Idx": 59
}
},
{
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 2537909, "tid": 2544200,
"ts": 6157602141233.158, "dur": 3.586,
"args": {
"External id": 573,"Record function id": 0, "Ev Idx": 60
}
},
{
"ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 2537909, "tid": 2544200,
"ts": 6157602141234.250, "dur": 1.763,
"args": {
"External id": 574,"Record function id": 0, "Ev Idx": 61
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 2537909, "tid": 2544200,
"ts": 6157602141234.771, "dur": 0.931,
"args": {
"External id": 575,"Record function id": 0, "Ev Idx": 62
}
},
{
"ph": "X", "cat": "cpu_op", "name": "detach", "pid": 2537909, "tid": 2544200,
"ts": 6157602141234.981, "dur": 0.621,
"args": {
"External id": 576,"Record function id": 0, "Ev Idx": 63
}
},
{
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 2537909, "tid": 2544200,
"ts": 6157602141240.640, "dur": 3.695,
"args": {
"External id": 577,"Record function id": 0, "Ev Idx": 64
}
},
{
"ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 2537909, "tid": 2544200,
"ts": 6157602141241.751, "dur": 1.773,
"args": {
"External id": 578,"Record function id": 0, "Ev Idx": 65
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 2537909, "tid": 2544200,
"ts": 6157602141242.562, "dur": 0.661,
"args": {
"External id": 579,"Record function id": 0, "Ev Idx": 66
}
},
{
"ph": "X", "cat": "cpu_op", "name": "detach", "pid": 2537909, "tid": 2544200,
"ts": 6157602141242.773, "dur": 0.350,
"args": {
"External id": 580,"Record function id": 0, "Ev Idx": 67
}
},
{
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 2537909, "tid": 2544200,
"ts": 6157602141950.821, "dur": 165.509,
"args": {
"External id": 581,"Record function id": 0, "Sequence number": 134, "Fwd thread id": 1, "Ev Idx": 68
}
},
{
"ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 2537909, "tid": 2544200,
"ts": 6157602141953.054, "dur": 149.405,
"args": {
"External id": 582,"Record function id": 0, "Sequence number": 134, "Fwd thread id": 1, "Ev Idx": 69
}
},
{
"ph": "f", "id": 5, "pid": 2537909, "tid": 2544200, "ts": 6157602141953.054,
"cat": "fwdbwd", "name": "fwdbwd", "bp": "e"
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_0", "pid": 2537909, "tid": 2544200,
"ts": 6157602142042.218, "dur": 19.079,
"args": {
"External id": 583,"Record function id": 0, "Ev Idx": 70
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_1", "pid": 2537909, "tid": 2544200,
"ts": 6157602142067.226, "dur": 7.822,
"args": {
"External id": 584,"Record function id": 0, "Ev Idx": 71
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_mul_2", "pid": 2537909, "tid": 2544200,
"ts": 6157602142083.020, "dur": 7.812,
"args": {
"External id": 585,"Record function id": 0, "Ev Idx": 72
}
},
{
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 2537909, "tid": 2544200,
"ts": 6157602142125.103, "dur": 6.640,
"args": {
"External id": 586,"Record function id": 0, "Ev Idx": 73
}
},
{
"ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 2537909, "tid": 2544200,
"ts": 6157602142126.385, "dur": 4.337,
"args": {
"External id": 587,"Record function id": 0, "Ev Idx": 74
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 2537909, "tid": 2544200,
"ts": 6157602142127.797, "dur": 2.454,
"args": {
"External id": 588,"Record function id": 0, "Ev Idx": 75
}
},
{
"ph": "X", "cat": "cpu_op", "name": "detach", "pid": 2537909, "tid": 2544200,
"ts": 6157602142128.358, "dur": 1.773,
"args": {
"External id": 589,"Record function id": 0, "Ev Idx": 76
}
},
{
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 2537909, "tid": 2544200,
"ts": 6157602142136.431, "dur": 3.555,
"args": {
"External id": 590,"Record function id": 0, "Ev Idx": 77
}
},
{
"ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 2537909, "tid": 2544200,
"ts": 6157602142137.482, "dur": 1.813,
"args": {
"External id": 591,"Record function id": 0, "Ev Idx": 78
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 2537909, "tid": 2544200,
"ts": 6157602142137.993, "dur": 0.841,
"args": {
"External id": 592,"Record function id": 0, "Ev Idx": 79
}
},
{
"ph": "X", "cat": "cpu_op", "name": "detach", "pid": 2537909, "tid": 2544200,
"ts": 6157602142138.163, "dur": 0.571,
"args": {
"External id": 593,"Record function id": 0, "Ev Idx": 80
}
},
{
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 2537909, "tid": 2544200,
"ts": 6157602142143.742, "dur": 3.615,
"args": {
"External id": 594,"Record function id": 0, "Ev Idx": 81
}
},
{
"ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 2537909, "tid": 2544200,
"ts": 6157602142144.863, "dur": 1.863,
"args": {
"External id": 595,"Record function id": 0, "Ev Idx": 82
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 2537909, "tid": 2544200,
"ts": 6157602142145.594, "dur": 0.852,
"args": {
"External id": 596,"Record function id": 0, "Ev Idx": 83
}
},
{
"ph": "X", "cat": "cpu_op", "name": "detach", "pid": 2537909, "tid": 2544200,
"ts": 6157602142145.805, "dur": 0.550,
"args": {
"External id": 597,"Record function id": 0, "Ev Idx": 84
}
},
{
"ph": "X", "cat": "user_annotation", "name": "Step 0", "pid": 2537909, "tid": 2537909,
"ts": 6157602134832.545, "dur": 3689.324,
"args": {
"External id": 1,"Record function id": 0, "Ev Idx": 85
}
},
{
"ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 2537909, "tid": 2537909,
"ts": 6157602134895.550, "dur": 67.402,
"args": {
"External id": 2,"Record function id": 0, "Ev Idx": 86
}
},
{
"ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 0/0", "pid": 2537909, "tid": 2537909,
"ts": 6157602134966.146, "dur": 3489.453,
"args": {
"External id": 3,"Record function id": 0, "Ev Idx": 87
}
},
{
"ph": "X", "cat": "cpu_op", "name": "Pregraph bytecode", "pid": 2537909, "tid": 2537909,
"ts": 6157602134982.221, "dur": 5.037,
"args": {
"External id": 4,"Record function id": 0, "Ev Idx": 88
}
},
{
"ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 2537909, "tid": 2537909,
"ts": 6157602135062.402, "dur": 2277.555,
"args": {
"External id": 5,"Record function id": 0, "Sequence number": 130, "Fwd thread id": 0, "Ev Idx": 89
}
},
{
"ph": "s", "id": 1, "pid": 2537909, "tid": 2537909, "ts": 6157602135062.402,
"cat": "fwdbwd", "name": "fwdbwd"
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused_nll_loss_backward_nll_loss_forward_0", "pid": 2537909, "tid": 2537909,
"ts": 6157602136439.418, "dur": 67.392,
"args": {
"External id": 6,"Record function id": 0, "Ev Idx": 90
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_per_fused_nll_loss_forward_1", "pid": 2537909, "tid": 2537909,
"ts": 6157602136528.713, "dur": 14.622,
"args": {
"External id": 7,"Record function id": 0, "Ev Idx": 91
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_mul_2", "pid": 2537909, "tid": 2537909,
"ts": 6157602136564.988, "dur": 13.400,
"args": {
"External id": 8,"Record function id": 0, "Ev Idx": 92
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 2537909, "tid": 2537909,
"ts": 6157602136598.328, "dur": 43.376,
"args": {
"External id": 9,"Record function id": 0, "Ev Idx": 93
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 2537909, "tid": 2537909,
"ts": 6157602136608.584, "dur": 30.466,
"args": {
"External id": 10,"Record function id": 0, "Ev Idx": 94
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 2537909, "tid": 2537909,
"ts": 6157602136615.214, "dur": 18.948,
"args": {
"External id": 11,"Record function id": 0, "Ev Idx": 95
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 2537909, "tid": 2537909,
"ts": 6157602136624.958, "dur": 8.773,
"args": {
"External id": 12,"Record function id": 0, "Ev Idx": 96
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2537909, "tid": 2537909,
"ts": 6157602136628.484, "dur": 3.625,
"args": {
"External id": 13,"Record function id": 0, "Ev Idx": 97
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 2537909, "tid": 2537909,
"ts": 6157602136651.659, "dur": 3.525,
"args": {
"External id": 14,"Record function id": 0, "Ev Idx": 98
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 2537909, "tid": 2537909,
"ts": 6157602136652.039, "dur": 2.864,
"args": {
"External id": 15,"Record function id": 0, "Ev Idx": 99
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 2537909, "tid": 2537909,
"ts": 6157602136652.530, "dur": 1.612,
"args": {
"External id": 16,"Record function id": 0, "Ev Idx": 100
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 2537909, "tid": 2537909,
"ts": 6157602136652.981, "dur": 0.991,
"args": {
"External id": 17,"Record function id": 0, "Ev Idx": 101
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2537909, "tid": 2537909,
"ts": 6157602136653.411, "dur": 0.311,
"args": {
"External id": 18,"Record function id": 0, "Ev Idx": 102
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 2537909, "tid": 2537909,
"ts": 6157602136658.539, "dur": 2.964,
"args": {
"External id": 19,"Record function id": 0, "Ev Idx": 103
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 2537909, "tid": 2537909,
"ts": 6157602136658.839, "dur": 2.434,
"args": {
"External id": 20,"Record function id": 0, "Ev Idx": 104
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 2537909, "tid": 2537909,
"ts": 6157602136659.290, "dur": 1.192,
"args": {
"External id": 21,"Record function id": 0, "Ev Idx": 105
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 2537909, "tid": 2537909,
"ts": 6157602136659.611, "dur": 0.741,
"args": {
"External id": 22,"Record function id": 0, "Ev Idx": 106
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2537909, "tid": 2537909,
"ts": 6157602136659.911, "dur": 0.260,
"args": {
"External id": 23,"Record function id": 0, "Ev Idx": 107
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 2537909, "tid": 2537909,
"ts": 6157602136664.658, "dur": 2.734,
"args": {
"External id": 24,"Record function id": 0, "Ev Idx": 108
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 2537909, "tid": 2537909,
"ts": 6157602136664.959, "dur": 2.213,
"args": {
"External id": 25,"Record function id": 0, "Ev Idx": 109
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 2537909, "tid": 2537909,
"ts": 6157602136665.319, "dur": 1.082,
"args": {
"External id": 26,"Record function id": 0, "Ev Idx": 110
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 2537909, "tid": 2537909,
"ts": 6157602136665.640, "dur": 0.651,
"args": {
"External id": 27,"Record function id": 0, "Ev Idx": 111
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2537909, "tid": 2537909,
"ts": 6157602136665.910, "dur": 0.210,
"args": {
"External id": 28,"Record function id": 0, "Ev Idx": 112
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_3", "pid": 2537909, "tid": 2537909,
"ts": 6157602136684.228, "dur": 13.360,
"args": {
"External id": 29,"Record function id": 0, "Ev Idx": 113
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_4", "pid": 2537909, "tid": 2537909,
"ts": 6157602136710.828, "dur": 10.936,
"args": {
"External id": 30,"Record function id": 0, "Ev Idx": 114
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_5", "pid": 2537909, "tid": 2537909,
"ts": 6157602136734.103, "dur": 13.240,
"args": {
"External id": 31,"Record function id": 0, "Ev Idx": 115
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_addmm_mm_nll_loss_forward_6", "pid": 2537909, "tid": 2537909,
"ts": 6157602136785.701, "dur": 14.782,
"args": {
"External id": 32,"Record function id": 0, "Ev Idx": 116
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 2537909, "tid": 2537909,
"ts": 6157602136812.121, "dur": 106.931,
"args": {
"External id": 33,"Record function id": 0, "Ev Idx": 117
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2537909, "tid": 2537909,
"ts": 6157602136928.987, "dur": 4.737,
"args": {
"External id": 34,"Record function id": 0, "Ev Idx": 118
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused__log_softmax__log_softmax_backward_data_nll_loss_backward_nll_loss_forward_7", "pid": 2537909, "tid": 2537909,
"ts": 6157602136953.654, "dur": 23.355,
"args": {
"External id": 35,"Record function id": 0, "Ev Idx": 119
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused__log_softmax_backward_data_mm_nll_loss_forward_8", "pid": 2537909, "tid": 2537909,
"ts": 6157602136991.711, "dur": 11.257,
"args": {
"External id": 36,"Record function id": 0, "Ev Idx": 120
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 2537909, "tid": 2537909,
"ts": 6157602137008.467, "dur": 31.507,
"args": {
"External id": 37,"Record function id": 0, "Ev Idx": 121
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused_nll_loss_forward_sum_9", "pid": 2537909, "tid": 2537909,
"ts": 6157602137058.913, "dur": 15.553,
"args": {
"External id": 38,"Record function id": 0, "Ev Idx": 122
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::addmm", "pid": 2537909, "tid": 2537909,
"ts": 6157602137087.326, "dur": 101.903,
"args": {
"External id": 39,"Record function id": 0, "Ev Idx": 123
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused_nll_loss_forward_10", "pid": 2537909, "tid": 2537909,
"ts": 6157602137220.446, "dur": 14.472,
"args": {
"External id": 40,"Record function id": 0, "Ev Idx": 124
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_per_fused_nll_loss_forward_11", "pid": 2537909, "tid": 2537909,
"ts": 6157602137249.871, "dur": 10.846,
"args": {
"External id": 41,"Record function id": 0, "Ev Idx": 125
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::ones_like", "pid": 2537909, "tid": 2537909,
"ts": 6157602137401.890, "dur": 42.925,
"args": {
"External id": 42,"Record function id": 0, "Ev Idx": 126
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 2537909, "tid": 2537909,
"ts": 6157602137403.963, "dur": 14.322,
"args": {
"External id": 43,"Record function id": 0, "Ev Idx": 127
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 2537909, "tid": 2537909,
"ts": 6157602137410.293, "dur": 7.451,
"args": {
"External id": 44,"Record function id": 0, "Ev Idx": 128
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 2537909, "tid": 2537909,
"ts": 6157602137423.082, "dur": 21.492,
"args": {
"External id": 45,"Record function id": 0, "Ev Idx": 129
}
},
{
"ph": "X", "cat": "user_annotation", "name": "Step 1", "pid": 2537909, "tid": 2537909,
"ts": 6157602138535.910, "dur": 964.845,
"args": {
"External id": 46,"Record function id": 0, "Ev Idx": 130
}
},
{
"ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 2537909, "tid": 2537909,
"ts": 6157602138560.968, "dur": 22.885,
"args": {
"External id": 47,"Record function id": 0, "Ev Idx": 131
}
},
{
"ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 0/0", "pid": 2537909, "tid": 2537909,
"ts": 6157602138584.474, "dur": 880.317,
"args": {
"External id": 48,"Record function id": 0, "Ev Idx": 132
}
},
{
"ph": "X", "cat": "cpu_op", "name": "Pregraph bytecode", "pid": 2537909, "tid": 2537909,
"ts": 6157602138590.062, "dur": 1.753,
"args": {
"External id": 49,"Record function id": 0, "Ev Idx": 133
}
},
{
"ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 2537909, "tid": 2537909,
"ts": 6157602138622.571, "dur": 513.725,
"args": {
"External id": 50,"Record function id": 0, "Sequence number": 131, "Fwd thread id": 0, "Ev Idx": 134
}
},
{
"ph": "s", "id": 2, "pid": 2537909, "tid": 2537909, "ts": 6157602138622.571,
"cat": "fwdbwd", "name": "fwdbwd"
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused_nll_loss_backward_nll_loss_forward_0", "pid": 2537909, "tid": 2537909,
"ts": 6157602138683.313, "dur": 26.900,
"args": {
"External id": 51,"Record function id": 0, "Ev Idx": 135
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_per_fused_nll_loss_forward_1", "pid": 2537909, "tid": 2537909,
"ts": 6157602138720.138, "dur": 10.045,
"args": {
"External id": 52,"Record function id": 0, "Ev Idx": 136
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_mul_2", "pid": 2537909, "tid": 2537909,
"ts": 6157602138738.967, "dur": 8.232,
"args": {
"External id": 53,"Record function id": 0, "Ev Idx": 137
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 2537909, "tid": 2537909,
"ts": 6157602138755.672, "dur": 12.078,
"args": {
"External id": 54,"Record function id": 0, "Ev Idx": 138
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 2537909, "tid": 2537909,
"ts": 6157602138757.404, "dur": 9.405,
"args": {
"External id": 55,"Record function id": 0, "Ev Idx": 139
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 2537909, "tid": 2537909,
"ts": 6157602138759.638, "dur": 5.478,
"args": {
"External id": 56,"Record function id": 0, "Ev Idx": 140
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 2537909, "tid": 2537909,
"ts": 6157602138761.250, "dur": 3.535,
"args": {
"External id": 57,"Record function id": 0, "Ev Idx": 141
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2537909, "tid": 2537909,
"ts": 6157602138762.422, "dur": 1.713,
"args": {
"External id": 58,"Record function id": 0, "Ev Idx": 142
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 2537909, "tid": 2537909,
"ts": 6157602138774.039, "dur": 3.656,
"args": {
"External id": 59,"Record function id": 0, "Ev Idx": 143
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 2537909, "tid": 2537909,
"ts": 6157602138774.530, "dur": 2.844,
"args": {
"External id": 60,"Record function id": 0, "Ev Idx": 144
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 2537909, "tid": 2537909,
"ts": 6157602138774.971, "dur": 1.833,
"args": {
"External id": 61,"Record function id": 0, "Ev Idx": 145
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 2537909, "tid": 2537909,
"ts": 6157602138775.341, "dur": 1.272,
"args": {
"External id": 62,"Record function id": 0, "Ev Idx": 146
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2537909, "tid": 2537909,
"ts": 6157602138775.812, "dur": 0.541,
"args": {
"External id": 63,"Record function id": 0, "Ev Idx": 147
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 2537909, "tid": 2537909,
"ts": 6157602138780.850, "dur": 3.154,
"args": {
"External id": 64,"Record function id": 0, "Ev Idx": 148
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 2537909, "tid": 2537909,
"ts": 6157602138781.150, "dur": 2.604,
"args": {
"External id": 65,"Record function id": 0, "Ev Idx": 149
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 2537909, "tid": 2537909,
"ts": 6157602138781.531, "dur": 1.562,
"args": {
"External id": 66,"Record function id": 0, "Ev Idx": 150
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 2537909, "tid": 2537909,
"ts": 6157602138781.911, "dur": 1.042,
"args": {
"External id": 67,"Record function id": 0, "Ev Idx": 151
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2537909, "tid": 2537909,
"ts": 6157602138782.322, "dur": 0.431,
"args": {
"External id": 68,"Record function id": 0, "Ev Idx": 152
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 2537909, "tid": 2537909,
"ts": 6157602138786.929, "dur": 2.714,
"args": {
"External id": 69,"Record function id": 0, "Ev Idx": 153
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 2537909, "tid": 2537909,
"ts": 6157602138787.279, "dur": 2.134,
"args": {
"External id": 70,"Record function id": 0, "Ev Idx": 154
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 2537909, "tid": 2537909,
"ts": 6157602138787.620, "dur": 1.352,
"args": {
"External id": 71,"Record function id": 0, "Ev Idx": 155
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 2537909, "tid": 2537909,
"ts": 6157602138787.940, "dur": 0.872,
"args": {
"External id": 72,"Record function id": 0, "Ev Idx": 156
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2537909, "tid": 2537909,
"ts": 6157602138788.211, "dur": 0.430,
"args": {
"External id": 73,"Record function id": 0, "Ev Idx": 157
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_3", "pid": 2537909, "tid": 2537909,
"ts": 6157602138800.559, "dur": 9.314,
"args": {
"External id": 74,"Record function id": 0, "Ev Idx": 158
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_4", "pid": 2537909, "tid": 2537909,
"ts": 6157602138816.884, "dur": 7.161,
"args": {
"External id": 75,"Record function id": 0, "Ev Idx": 159
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_5", "pid": 2537909, "tid": 2537909,
"ts": 6157602138830.465, "dur": 6.880,
"args": {
"External id": 76,"Record function id": 0, "Ev Idx": 160
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_addmm_mm_nll_loss_forward_6", "pid": 2537909, "tid": 2537909,
"ts": 6157602138861.942, "dur": 10.235,
"args": {
"External id": 77,"Record function id": 0, "Ev Idx": 161
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 2537909, "tid": 2537909,
"ts": 6157602138876.865, "dur": 39.028,
"args": {
"External id": 78,"Record function id": 0, "Ev Idx": 162
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2537909, "tid": 2537909,
"ts": 6157602138923.144, "dur": 2.935,
"args": {
"External id": 79,"Record function id": 0, "Ev Idx": 163
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused__log_softmax__log_softmax_backward_data_nll_loss_backward_nll_loss_forward_7", "pid": 2537909, "tid": 2537909,
"ts": 6157602138937.756, "dur": 14.903,
"args": {
"External id": 80,"Record function id": 0, "Ev Idx": 164
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused__log_softmax_backward_data_mm_nll_loss_forward_8", "pid": 2537909, "tid": 2537909,
"ts": 6157602138961.041, "dur": 7.682,
"args": {
"External id": 81,"Record function id": 0, "Ev Idx": 165
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 2537909, "tid": 2537909,
"ts": 6157602138973.981, "dur": 22.023,
"args": {
"External id": 82,"Record function id": 0, "Ev Idx": 166
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused_nll_loss_forward_sum_9", "pid": 2537909, "tid": 2537909,
"ts": 6157602139007.992, "dur": 9.294,
"args": {
"External id": 83,"Record function id": 0, "Ev Idx": 167
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::addmm", "pid": 2537909, "tid": 2537909,
"ts": 6157602139024.477, "dur": 34.993,
"args": {
"External id": 84,"Record function id": 0, "Ev Idx": 168
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused_nll_loss_forward_10", "pid": 2537909, "tid": 2537909,
"ts": 6157602139078.609, "dur": 10.175,
"args": {
"External id": 85,"Record function id": 0, "Ev Idx": 169
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_per_fused_nll_loss_forward_11", "pid": 2537909, "tid": 2537909,
"ts": 6157602139095.604, "dur": 7.532,
"args": {
"External id": 86,"Record function id": 0, "Ev Idx": 170
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::ones_like", "pid": 2537909, "tid": 2537909,
"ts": 6157602139162.445, "dur": 18.027,
"args": {
"External id": 87,"Record function id": 0, "Ev Idx": 171
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 2537909, "tid": 2537909,
"ts": 6157602139163.066, "dur": 5.529,
"args": {
"External id": 88,"Record function id": 0, "Ev Idx": 172
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 2537909, "tid": 2537909,
"ts": 6157602139163.927, "dur": 4.237,
"args": {
"External id": 89,"Record function id": 0, "Ev Idx": 173
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 2537909, "tid": 2537909,
"ts": 6157602139169.245, "dur": 10.987,
"args": {
"External id": 90,"Record function id": 0, "Ev Idx": 174
}
},
{
"ph": "X", "cat": "user_annotation", "name": "Step 2", "pid": 2537909, "tid": 2537909,
"ts": 6157602139510.931, "dur": 902.441,
"args": {
"External id": 91,"Record function id": 0, "Ev Idx": 175
}
},
{
"ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 2537909, "tid": 2537909,
"ts": 6157602139530.260, "dur": 14.562,
"args": {
"External id": 92,"Record function id": 0, "Ev Idx": 176
}
},
{
"ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 0/0", "pid": 2537909, "tid": 2537909,
"ts": 6157602139545.353, "dur": 834.088,
"args": {
"External id": 93,"Record function id": 0, "Ev Idx": 177
}
},
{
"ph": "X", "cat": "cpu_op", "name": "Pregraph bytecode", "pid": 2537909, "tid": 2537909,
"ts": 6157602139549.940, "dur": 1.852,
"args": {
"External id": 94,"Record function id": 0, "Ev Idx": 178
}
},
{
"ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 2537909, "tid": 2537909,
"ts": 6157602139577.691, "dur": 486.835,
"args": {
"External id": 95,"Record function id": 0, "Sequence number": 132, "Fwd thread id": 0, "Ev Idx": 179
}
},
{
"ph": "s", "id": 3, "pid": 2537909, "tid": 2537909, "ts": 6157602139577.691,
"cat": "fwdbwd", "name": "fwdbwd"
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused_nll_loss_backward_nll_loss_forward_0", "pid": 2537909, "tid": 2537909,
"ts": 6157602139633.736, "dur": 24.968,
"args": {
"External id": 96,"Record function id": 0, "Ev Idx": 180
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_per_fused_nll_loss_forward_1", "pid": 2537909, "tid": 2537909,
"ts": 6157602139666.906, "dur": 9.835,
"args": {
"External id": 97,"Record function id": 0, "Ev Idx": 181
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_mul_2", "pid": 2537909, "tid": 2537909,
"ts": 6157602139685.103, "dur": 7.832,
"args": {
"External id": 98,"Record function id": 0, "Ev Idx": 182
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 2537909, "tid": 2537909,
"ts": 6157602139701.128, "dur": 11.497,
"args": {
"External id": 99,"Record function id": 0, "Ev Idx": 183
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 2537909, "tid": 2537909,
"ts": 6157602139702.800, "dur": 8.844,
"args": {
"External id": 100,"Record function id": 0, "Ev Idx": 184
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 2537909, "tid": 2537909,
"ts": 6157602139704.843, "dur": 5.278,
"args": {
"External id": 101,"Record function id": 0, "Ev Idx": 185
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 2537909, "tid": 2537909,
"ts": 6157602139706.436, "dur": 3.355,
"args": {
"External id": 102,"Record function id": 0, "Ev Idx": 186
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2537909, "tid": 2537909,
"ts": 6157602139707.587, "dur": 1.573,
"args": {
"External id": 103,"Record function id": 0, "Ev Idx": 187
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 2537909, "tid": 2537909,
"ts": 6157602139719.485, "dur": 3.566,
"args": {
"External id": 104,"Record function id": 0, "Ev Idx": 188
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 2537909, "tid": 2537909,
"ts": 6157602139719.886, "dur": 2.704,
"args": {
"External id": 105,"Record function id": 0, "Ev Idx": 189
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 2537909, "tid": 2537909,
"ts": 6157602139720.357, "dur": 1.692,
"args": {
"External id": 106,"Record function id": 0, "Ev Idx": 190
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 2537909, "tid": 2537909,
"ts": 6157602139720.757, "dur": 1.112,
"args": {
"External id": 107,"Record function id": 0, "Ev Idx": 191
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2537909, "tid": 2537909,
"ts": 6157602139721.108, "dur": 0.531,
"args": {
"External id": 108,"Record function id": 0, "Ev Idx": 192
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 2537909, "tid": 2537909,
"ts": 6157602139726.175, "dur": 2.855,
"args": {
"External id": 109,"Record function id": 0, "Ev Idx": 193
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 2537909, "tid": 2537909,
"ts": 6157602139726.476, "dur": 2.243,
"args": {
"External id": 110,"Record function id": 0, "Ev Idx": 194
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 2537909, "tid": 2537909,
"ts": 6157602139726.997, "dur": 1.211,
"args": {
"External id": 111,"Record function id": 0, "Ev Idx": 195
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 2537909, "tid": 2537909,
"ts": 6157602139727.337, "dur": 0.741,
"args": {
"External id": 112,"Record function id": 0, "Ev Idx": 196
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2537909, "tid": 2537909,
"ts": 6157602139727.658, "dur": 0.240,
"args": {
"External id": 113,"Record function id": 0, "Ev Idx": 197
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 2537909, "tid": 2537909,
"ts": 6157602139732.124, "dur": 2.424,
"args": {
"External id": 114,"Record function id": 0, "Ev Idx": 198
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 2537909, "tid": 2537909,
"ts": 6157602139732.415, "dur": 1.923,
"args": {
"External id": 115,"Record function id": 0, "Ev Idx": 199
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 2537909, "tid": 2537909,
"ts": 6157602139732.735, "dur": 1.102,
"args": {
"External id": 116,"Record function id": 0, "Ev Idx": 200
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 2537909, "tid": 2537909,
"ts": 6157602139733.066, "dur": 0.651,
"args": {
"External id": 117,"Record function id": 0, "Ev Idx": 201
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2537909, "tid": 2537909,
"ts": 6157602139733.346, "dur": 0.191,
"args": {
"External id": 118,"Record function id": 0, "Ev Idx": 202
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_3", "pid": 2537909, "tid": 2537909,
"ts": 6157602139744.403, "dur": 9.214,
"args": {
"External id": 119,"Record function id": 0, "Ev Idx": 203
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_4", "pid": 2537909, "tid": 2537909,
"ts": 6157602139759.996, "dur": 7.542,
"args": {
"External id": 120,"Record function id": 0, "Ev Idx": 204
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_5", "pid": 2537909, "tid": 2537909,
"ts": 6157602139774.558, "dur": 6.701,
"args": {
"External id": 121,"Record function id": 0, "Ev Idx": 205
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_addmm_mm_nll_loss_forward_6", "pid": 2537909, "tid": 2537909,
"ts": 6157602139805.064, "dur": 9.294,
"args": {
"External id": 122,"Record function id": 0, "Ev Idx": 206
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 2537909, "tid": 2537909,
"ts": 6157602139818.765, "dur": 33.851,
"args": {
"External id": 123,"Record function id": 0, "Ev Idx": 207
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2537909, "tid": 2537909,
"ts": 6157602139859.396, "dur": 2.744,
"args": {
"External id": 124,"Record function id": 0, "Ev Idx": 208
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused__log_softmax__log_softmax_backward_data_nll_loss_backward_nll_loss_forward_7", "pid": 2537909, "tid": 2537909,
"ts": 6157602139873.448, "dur": 14.722,
"args": {
"External id": 125,"Record function id": 0, "Ev Idx": 209
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused__log_softmax_backward_data_mm_nll_loss_forward_8", "pid": 2537909, "tid": 2537909,
"ts": 6157602139896.472, "dur": 7.121,
"args": {
"External id": 126,"Record function id": 0, "Ev Idx": 210
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 2537909, "tid": 2537909,
"ts": 6157602139908.470, "dur": 20.992,
"args": {
"External id": 127,"Record function id": 0, "Ev Idx": 211
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused_nll_loss_forward_sum_9", "pid": 2537909, "tid": 2537909,
"ts": 6157602139941.170, "dur": 9.584,
"args": {
"External id": 128,"Record function id": 0, "Ev Idx": 212
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::addmm", "pid": 2537909, "tid": 2537909,
"ts": 6157602139957.454, "dur": 33.280,
"args": {
"External id": 129,"Record function id": 0, "Ev Idx": 213
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused_nll_loss_forward_10", "pid": 2537909, "tid": 2537909,
"ts": 6157602140009.573, "dur": 10.496,
"args": {
"External id": 130,"Record function id": 0, "Ev Idx": 214
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_per_fused_nll_loss_forward_11", "pid": 2537909, "tid": 2537909,
"ts": 6157602140026.268, "dur": 7.461,
"args": {
"External id": 131,"Record function id": 0, "Ev Idx": 215
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::ones_like", "pid": 2537909, "tid": 2537909,
"ts": 6157602140088.332, "dur": 17.837,
"args": {
"External id": 132,"Record function id": 0, "Ev Idx": 216
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 2537909, "tid": 2537909,
"ts": 6157602140089.063, "dur": 5.658,
"args": {
"External id": 133,"Record function id": 0, "Ev Idx": 217
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 2537909, "tid": 2537909,
"ts": 6157602140089.994, "dur": 4.347,
"args": {
"External id": 134,"Record function id": 0, "Ev Idx": 218
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 2537909, "tid": 2537909,
"ts": 6157602140095.332, "dur": 10.606,
"args": {
"External id": 135,"Record function id": 0, "Ev Idx": 219
}
},
{
"ph": "X", "cat": "user_annotation", "name": "Step 3", "pid": 2537909, "tid": 2537909,
"ts": 6157602140423.197, "dur": 889.161,
"args": {
"External id": 136,"Record function id": 0, "Ev Idx": 220
}
},
{
"ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 2537909, "tid": 2537909,
"ts": 6157602140442.916, "dur": 15.454,
"args": {
"External id": 137,"Record function id": 0, "Ev Idx": 221
}
},
{
"ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 0/0", "pid": 2537909, "tid": 2537909,
"ts": 6157602140459.241, "dur": 812.065,
"args": {
"External id": 138,"Record function id": 0, "Ev Idx": 222
}
},
{
"ph": "X", "cat": "cpu_op", "name": "Pregraph bytecode", "pid": 2537909, "tid": 2537909,
"ts": 6157602140463.227, "dur": 1.602,
"args": {
"External id": 139,"Record function id": 0, "Ev Idx": 223
}
},
{
"ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 2537909, "tid": 2537909,
"ts": 6157602140490.488, "dur": 480.174,
"args": {
"External id": 140,"Record function id": 0, "Sequence number": 133, "Fwd thread id": 0, "Ev Idx": 224
}
},
{
"ph": "s", "id": 4, "pid": 2537909, "tid": 2537909, "ts": 6157602140490.488,
"cat": "fwdbwd", "name": "fwdbwd"
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused_nll_loss_backward_nll_loss_forward_0", "pid": 2537909, "tid": 2537909,
"ts": 6157602140543.789, "dur": 25.198,
"args": {
"External id": 141,"Record function id": 0, "Ev Idx": 225
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_per_fused_nll_loss_forward_1", "pid": 2537909, "tid": 2537909,
"ts": 6157602140578.261, "dur": 9.444,
"args": {
"External id": 142,"Record function id": 0, "Ev Idx": 226
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_mul_2", "pid": 2537909, "tid": 2537909,
"ts": 6157602140595.947, "dur": 7.642,
"args": {
"External id": 143,"Record function id": 0, "Ev Idx": 227
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 2537909, "tid": 2537909,
"ts": 6157602140611.431, "dur": 11.116,
"args": {
"External id": 144,"Record function id": 0, "Ev Idx": 228
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 2537909, "tid": 2537909,
"ts": 6157602140613.143, "dur": 8.553,
"args": {
"External id": 145,"Record function id": 0, "Ev Idx": 229
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 2537909, "tid": 2537909,
"ts": 6157602140615.026, "dur": 5.178,
"args": {
"External id": 146,"Record function id": 0, "Ev Idx": 230
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 2537909, "tid": 2537909,
"ts": 6157602140616.819, "dur": 3.074,
"args": {
"External id": 147,"Record function id": 0, "Ev Idx": 231
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2537909, "tid": 2537909,
"ts": 6157602140617.860, "dur": 1.422,
"args": {
"External id": 148,"Record function id": 0, "Ev Idx": 232
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 2537909, "tid": 2537909,
"ts": 6157602140629.197, "dur": 3.456,
"args": {
"External id": 149,"Record function id": 0, "Ev Idx": 233
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 2537909, "tid": 2537909,
"ts": 6157602140629.598, "dur": 2.774,
"args": {
"External id": 150,"Record function id": 0, "Ev Idx": 234
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 2537909, "tid": 2537909,
"ts": 6157602140630.039, "dur": 1.802,
"args": {
"External id": 151,"Record function id": 0, "Ev Idx": 235
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 2537909, "tid": 2537909,
"ts": 6157602140630.469, "dur": 1.172,
"args": {
"External id": 152,"Record function id": 0, "Ev Idx": 236
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2537909, "tid": 2537909,
"ts": 6157602140630.820, "dur": 0.571,
"args": {
"External id": 153,"Record function id": 0, "Ev Idx": 237
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 2537909, "tid": 2537909,
"ts": 6157602140635.797, "dur": 2.714,
"args": {
"External id": 154,"Record function id": 0, "Ev Idx": 238
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 2537909, "tid": 2537909,
"ts": 6157602140636.078, "dur": 2.183,
"args": {
"External id": 155,"Record function id": 0, "Ev Idx": 239
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 2537909, "tid": 2537909,
"ts": 6157602140636.488, "dur": 1.282,
"args": {
"External id": 156,"Record function id": 0, "Ev Idx": 240
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 2537909, "tid": 2537909,
"ts": 6157602140636.829, "dur": 0.811,
"args": {
"External id": 157,"Record function id": 0, "Ev Idx": 241
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2537909, "tid": 2537909,
"ts": 6157602140637.189, "dur": 0.271,
"args": {
"External id": 158,"Record function id": 0, "Ev Idx": 242
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 2537909, "tid": 2537909,
"ts": 6157602140641.376, "dur": 2.704,
"args": {
"External id": 159,"Record function id": 0, "Ev Idx": 243
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 2537909, "tid": 2537909,
"ts": 6157602140641.676, "dur": 2.193,
"args": {
"External id": 160,"Record function id": 0, "Ev Idx": 244
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 2537909, "tid": 2537909,
"ts": 6157602140642.017, "dur": 1.382,
"args": {
"External id": 161,"Record function id": 0, "Ev Idx": 245
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 2537909, "tid": 2537909,
"ts": 6157602140642.357, "dur": 0.892,
"args": {
"External id": 162,"Record function id": 0, "Ev Idx": 246
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2537909, "tid": 2537909,
"ts": 6157602140642.648, "dur": 0.430,
"args": {
"External id": 163,"Record function id": 0, "Ev Idx": 247
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_3", "pid": 2537909, "tid": 2537909,
"ts": 6157602140653.344, "dur": 8.563,
"args": {
"External id": 164,"Record function id": 0, "Ev Idx": 248
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_4", "pid": 2537909, "tid": 2537909,
"ts": 6157602140668.306, "dur": 6.951,
"args": {
"External id": 165,"Record function id": 0, "Ev Idx": 249
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_5", "pid": 2537909, "tid": 2537909,
"ts": 6157602140681.947, "dur": 6.920,
"args": {
"External id": 166,"Record function id": 0, "Ev Idx": 250
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_addmm_mm_nll_loss_forward_6", "pid": 2537909, "tid": 2537909,
"ts": 6157602140713.685, "dur": 8.032,
"args": {
"External id": 167,"Record function id": 0, "Ev Idx": 251
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 2537909, "tid": 2537909,
"ts": 6157602140725.823, "dur": 36.505,
"args": {
"External id": 168,"Record function id": 0, "Ev Idx": 252
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2537909, "tid": 2537909,
"ts": 6157602140769.118, "dur": 2.835,
"args": {
"External id": 169,"Record function id": 0, "Ev Idx": 253
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused__log_softmax__log_softmax_backward_data_nll_loss_backward_nll_loss_forward_7", "pid": 2537909, "tid": 2537909,
"ts": 6157602140783.460, "dur": 13.841,
"args": {
"External id": 170,"Record function id": 0, "Ev Idx": 254
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused__log_softmax_backward_data_mm_nll_loss_forward_8", "pid": 2537909, "tid": 2537909,
"ts": 6157602140804.181, "dur": 6.460,
"args": {
"External id": 171,"Record function id": 0, "Ev Idx": 255
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 2537909, "tid": 2537909,
"ts": 6157602140815.699, "dur": 21.152,
"args": {
"External id": 172,"Record function id": 0, "Ev Idx": 256
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused_nll_loss_forward_sum_9", "pid": 2537909, "tid": 2537909,
"ts": 6157602140848.027, "dur": 8.714,
"args": {
"External id": 173,"Record function id": 0, "Ev Idx": 257
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::addmm", "pid": 2537909, "tid": 2537909,
"ts": 6157602140863.411, "dur": 33.280,
"args": {
"External id": 174,"Record function id": 0, "Ev Idx": 258
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused_nll_loss_forward_10", "pid": 2537909, "tid": 2537909,
"ts": 6157602140914.908, "dur": 10.646,
"args": {
"External id": 175,"Record function id": 0, "Ev Idx": 259
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_per_fused_nll_loss_forward_11", "pid": 2537909, "tid": 2537909,
"ts": 6157602140932.094, "dur": 7.822,
"args": {
"External id": 176,"Record function id": 0, "Ev Idx": 260
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::ones_like", "pid": 2537909, "tid": 2537909,
"ts": 6157602140995.400, "dur": 18.057,
"args": {
"External id": 177,"Record function id": 0, "Ev Idx": 261
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 2537909, "tid": 2537909,
"ts": 6157602140996.011, "dur": 5.728,
"args": {
"External id": 178,"Record function id": 0, "Ev Idx": 262
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 2537909, "tid": 2537909,
"ts": 6157602140996.952, "dur": 4.357,
"args": {
"External id": 179,"Record function id": 0, "Ev Idx": 263
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 2537909, "tid": 2537909,
"ts": 6157602141002.340, "dur": 10.897,
"args": {
"External id": 180,"Record function id": 0, "Ev Idx": 264
}
},
{
"ph": "X", "cat": "user_annotation", "name": "Step 4", "pid": 2537909, "tid": 2537909,
"ts": 6157602141322.403, "dur": 884.123,
"args": {
"External id": 181,"Record function id": 0, "Ev Idx": 265
}
},
{
"ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 2537909, "tid": 2537909,
"ts": 6157602141341.171, "dur": 15.674,
"args": {
"External id": 182,"Record function id": 0, "Ev Idx": 266
}
},
{
"ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 0/0", "pid": 2537909, "tid": 2537909,
"ts": 6157602141357.386, "dur": 815.660,
"args": {
"External id": 183,"Record function id": 0, "Ev Idx": 267
}
},
{
"ph": "X", "cat": "cpu_op", "name": "Pregraph bytecode", "pid": 2537909, "tid": 2537909,
"ts": 6157602141361.312, "dur": 1.382,
"args": {
"External id": 184,"Record function id": 0, "Ev Idx": 268
}
},
{
"ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 2537909, "tid": 2537909,
"ts": 6157602141388.873, "dur": 483.830,
"args": {
"External id": 185,"Record function id": 0, "Sequence number": 134, "Fwd thread id": 0, "Ev Idx": 269
}
},
{
"ph": "s", "id": 5, "pid": 2537909, "tid": 2537909, "ts": 6157602141388.873,
"cat": "fwdbwd", "name": "fwdbwd"
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused_nll_loss_backward_nll_loss_forward_0", "pid": 2537909, "tid": 2537909,
"ts": 6157602141442.574, "dur": 24.597,
"args": {
"External id": 186,"Record function id": 0, "Ev Idx": 270
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_per_fused_nll_loss_forward_1", "pid": 2537909, "tid": 2537909,
"ts": 6157602141476.425, "dur": 9.014,
"args": {
"External id": 187,"Record function id": 0, "Ev Idx": 271
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_mul_2", "pid": 2537909, "tid": 2537909,
"ts": 6157602141493.992, "dur": 8.232,
"args": {
"External id": 188,"Record function id": 0, "Ev Idx": 272
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 2537909, "tid": 2537909,
"ts": 6157602141510.096, "dur": 10.956,
"args": {
"External id": 189,"Record function id": 0, "Ev Idx": 273
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 2537909, "tid": 2537909,
"ts": 6157602141511.618, "dur": 8.603,
"args": {
"External id": 190,"Record function id": 0, "Ev Idx": 274
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 2537909, "tid": 2537909,
"ts": 6157602141513.391, "dur": 5.328,
"args": {
"External id": 191,"Record function id": 0, "Ev Idx": 275
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 2537909, "tid": 2537909,
"ts": 6157602141515.404, "dur": 2.964,
"args": {
"External id": 192,"Record function id": 0, "Ev Idx": 276
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2537909, "tid": 2537909,
"ts": 6157602141516.566, "dur": 1.161,
"args": {
"External id": 193,"Record function id": 0, "Ev Idx": 277
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 2537909, "tid": 2537909,
"ts": 6157602141527.572, "dur": 3.676,
"args": {
"External id": 194,"Record function id": 0, "Ev Idx": 278
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 2537909, "tid": 2537909,
"ts": 6157602141527.973, "dur": 2.964,
"args": {
"External id": 195,"Record function id": 0, "Ev Idx": 279
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 2537909, "tid": 2537909,
"ts": 6157602141528.404, "dur": 1.993,
"args": {
"External id": 196,"Record function id": 0, "Ev Idx": 280
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 2537909, "tid": 2537909,
"ts": 6157602141528.954, "dur": 1.272,
"args": {
"External id": 197,"Record function id": 0, "Ev Idx": 281
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2537909, "tid": 2537909,
"ts": 6157602141529.395, "dur": 0.591,
"args": {
"External id": 198,"Record function id": 0, "Ev Idx": 282
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 2537909, "tid": 2537909,
"ts": 6157602141534.543, "dur": 2.514,
"args": {
"External id": 199,"Record function id": 0, "Ev Idx": 283
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 2537909, "tid": 2537909,
"ts": 6157602141534.833, "dur": 2.013,
"args": {
"External id": 200,"Record function id": 0, "Ev Idx": 284
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 2537909, "tid": 2537909,
"ts": 6157602141535.214, "dur": 1.121,
"args": {
"External id": 201,"Record function id": 0, "Ev Idx": 285
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 2537909, "tid": 2537909,
"ts": 6157602141535.554, "dur": 0.651,
"args": {
"External id": 202,"Record function id": 0, "Ev Idx": 286
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2537909, "tid": 2537909,
"ts": 6157602141535.835, "dur": 0.200,
"args": {
"External id": 203,"Record function id": 0, "Ev Idx": 287
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 2537909, "tid": 2537909,
"ts": 6157602141539.901, "dur": 2.864,
"args": {
"External id": 204,"Record function id": 0, "Ev Idx": 288
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 2537909, "tid": 2537909,
"ts": 6157602141540.261, "dur": 2.284,
"args": {
"External id": 205,"Record function id": 0, "Ev Idx": 289
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 2537909, "tid": 2537909,
"ts": 6157602141540.582, "dur": 1.512,
"args": {
"External id": 206,"Record function id": 0, "Ev Idx": 290
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 2537909, "tid": 2537909,
"ts": 6157602141541.083, "dur": 0.861,
"args": {
"External id": 207,"Record function id": 0, "Ev Idx": 291
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2537909, "tid": 2537909,
"ts": 6157602141541.393, "dur": 0.391,
"args": {
"External id": 208,"Record function id": 0, "Ev Idx": 292
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_3", "pid": 2537909, "tid": 2537909,
"ts": 6157602141553.311, "dur": 9.044,
"args": {
"External id": 209,"Record function id": 0, "Ev Idx": 293
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_4", "pid": 2537909, "tid": 2537909,
"ts": 6157602141570.006, "dur": 7.592,
"args": {
"External id": 210,"Record function id": 0, "Ev Idx": 294
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_5", "pid": 2537909, "tid": 2537909,
"ts": 6157602141583.687, "dur": 5.919,
"args": {
"External id": 211,"Record function id": 0, "Ev Idx": 295
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_addmm_mm_nll_loss_forward_6", "pid": 2537909, "tid": 2537909,
"ts": 6157602141613.792, "dur": 9.064,
"args": {
"External id": 212,"Record function id": 0, "Ev Idx": 296
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 2537909, "tid": 2537909,
"ts": 6157602141627.523, "dur": 32.940,
"args": {
"External id": 213,"Record function id": 0, "Ev Idx": 297
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2537909, "tid": 2537909,
"ts": 6157602141667.233, "dur": 2.854,
"args": {
"External id": 214,"Record function id": 0, "Ev Idx": 298
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused__log_softmax__log_softmax_backward_data_nll_loss_backward_nll_loss_forward_7", "pid": 2537909, "tid": 2537909,
"ts": 6157602141681.514, "dur": 15.424,
"args": {
"External id": 215,"Record function id": 0, "Ev Idx": 299
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused__log_softmax_backward_data_mm_nll_loss_forward_8", "pid": 2537909, "tid": 2537909,
"ts": 6157602141704.259, "dur": 7.100,
"args": {
"External id": 216,"Record function id": 0, "Ev Idx": 300
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 2537909, "tid": 2537909,
"ts": 6157602141716.647, "dur": 20.802,
"args": {
"External id": 217,"Record function id": 0, "Ev Idx": 301
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused_nll_loss_forward_sum_9", "pid": 2537909, "tid": 2537909,
"ts": 6157602141748.686, "dur": 8.613,
"args": {
"External id": 218,"Record function id": 0, "Ev Idx": 302
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::addmm", "pid": 2537909, "tid": 2537909,
"ts": 6157602141763.809, "dur": 32.809,
"args": {
"External id": 219,"Record function id": 0, "Ev Idx": 303
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused_nll_loss_forward_10", "pid": 2537909, "tid": 2537909,
"ts": 6157602141816.368, "dur": 10.195,
"args": {
"External id": 220,"Record function id": 0, "Ev Idx": 304
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_per_fused_nll_loss_forward_11", "pid": 2537909, "tid": 2537909,
"ts": 6157602141833.954, "dur": 7.391,
"args": {
"External id": 221,"Record function id": 0, "Ev Idx": 305
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::ones_like", "pid": 2537909, "tid": 2537909,
"ts": 6157602141899.603, "dur": 17.186,
"args": {
"External id": 222,"Record function id": 0, "Ev Idx": 306
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 2537909, "tid": 2537909,
"ts": 6157602141900.244, "dur": 5.378,
"args": {
"External id": 223,"Record function id": 0, "Ev Idx": 307
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 2537909, "tid": 2537909,
"ts": 6157602141901.066, "dur": 4.136,
"args": {
"External id": 224,"Record function id": 0, "Ev Idx": 308
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 2537909, "tid": 2537909,
"ts": 6157602141906.223, "dur": 10.346,
"args": {
"External id": 225,"Record function id": 0, "Ev Idx": 309
}
},
{
"ph": "X", "cat": "overhead", "name": "Unrecognized", "pid": -1, "tid": 0,
"ts": 6157602135110.995, "dur": 1241.643
},
{
"ph": "X", "cat": "kernel", "name": "triton_red_fused_nll_loss_backward_nll_loss_forward_0", "pid": 0, "tid": 7,
"ts": 6157602136504.305, "dur": 7.359,
"args": {
"External id": 6, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 30, "registers per thread": 32, "shared memory": 16384, "blocks per SM": 0.030303, "warps per SM": 0.484848, "grid": [4, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 1
}
},
{
"ph": "f", "id": 30, "pid": 0, "tid": 7, "ts": 6157602136504.305,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909,
"ts": 6157602136469.364, "dur": 36.254,
"args": {
"External id": 6, "cbid": 307, "correlation": 30
}
},
{
"ph": "s", "id": 30, "pid": 2537909, "tid": 2537909, "ts": 6157602136469.364,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_per_fused_nll_loss_forward_1", "pid": 0, "tid": 7,
"ts": 6157602136543.888, "dur": 1.441,
"args": {
"External id": 7, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 38, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.015152, "grid": [1, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0
}
},
{
"ph": "f", "id": 38, "pid": 0, "tid": 7, "ts": 6157602136543.888,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909,
"ts": 6157602136536.084, "dur": 6.340,
"args": {
"External id": 7, "cbid": 307, "correlation": 38
}
},
{
"ph": "s", "id": 38, "pid": 2537909, "tid": 2537909, "ts": 6157602136536.084,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_mul_2", "pid": 0, "tid": 7,
"ts": 6157602136579.248, "dur": 47.200,
"args": {
"External id": 8, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 46, "registers per thread": 16, "shared memory": 0, "blocks per SM": 186.181824, "warps per SM": 744.727295, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100
}
},
{
"ph": "f", "id": 46, "pid": 0, "tid": 7, "ts": 6157602136579.248,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909,
"ts": 6157602136572.539, "dur": 5.108,
"args": {
"External id": 8, "cbid": 307, "correlation": 46
}
},
{
"ph": "s", "id": 46, "pid": 2537909, "tid": 2537909, "ts": 6157602136572.539,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_3", "pid": 0, "tid": 7,
"ts": 6157602136698.897, "dur": 1.536,
"args": {
"External id": 29, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 53, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.007576, "grid": [1, 1, 1], "block": [32, 1, 1], "est. achieved occupancy %": 0
}
},
{
"ph": "f", "id": 53, "pid": 0, "tid": 7, "ts": 6157602136698.897,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909,
"ts": 6157602136690.607, "dur": 6.190,
"args": {
"External id": 29, "cbid": 307, "correlation": 53
}
},
{
"ph": "s", "id": 53, "pid": 2537909, "tid": 2537909, "ts": 6157602136690.607,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_4", "pid": 0, "tid": 7,
"ts": 6157602136722.865, "dur": 31.520,
"args": {
"External id": 30, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 60, "registers per thread": 16, "shared memory": 0, "blocks per SM": 285.553040, "warps per SM": 1142.212158, "grid": [37693, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100
}
},
{
"ph": "f", "id": 60, "pid": 0, "tid": 7, "ts": 6157602136722.865,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909,
"ts": 6157602136716.707, "dur": 4.546,
"args": {
"External id": 30, "cbid": 307, "correlation": 60
}
},
{
"ph": "s", "id": 60, "pid": 2537909, "tid": 2537909, "ts": 6157602136716.707,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_5", "pid": 0, "tid": 7,
"ts": 6157602136755.409, "dur": 1.376,
"args": {
"External id": 31, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 67, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.750000, "warps per SM": 3.000000, "grid": [99, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 5
}
},
{
"ph": "f", "id": 67, "pid": 0, "tid": 7, "ts": 6157602136755.409,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909,
"ts": 6157602136739.451, "dur": 7.461,
"args": {
"External id": 31, "cbid": 307, "correlation": 67
}
},
{
"ph": "s", "id": 67, "pid": 2537909, "tid": 2537909, "ts": 6157602136739.451,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_addmm_mm_nll_loss_forward_6", "pid": 0, "tid": 7,
"ts": 6157602136801.745, "dur": 109.024,
"args": {
"External id": 32, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 82, "registers per thread": 16, "shared memory": 0, "blocks per SM": 285.590912, "warps per SM": 1142.363647, "grid": [37698, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100
}
},
{
"ph": "f", "id": 82, "pid": 0, "tid": 7, "ts": 6157602136801.745,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909,
"ts": 6157602136794.474, "dur": 5.518,
"args": {
"External id": 32, "cbid": 307, "correlation": 82
}
},
{
"ph": "s", "id": 82, "pid": 2537909, "tid": 2537909, "ts": 6157602136794.474,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7,
"ts": 6157602136913.649, "dur": 0.800,
"args": {
"External id": 33, "device": 0, "context": 1, "stream": 7, "correlation": 97, "bytes": 4, "memory bandwidth (GB/s)": 0.005
}
},
{
"ph": "f", "id": 97, "pid": 0, "tid": 7, "ts": 6157602136913.649,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 2537909, "tid": 2537909,
"ts": 6157602136895.056, "dur": 13.740,
"args": {
"External id": 33, "cbid": 51, "correlation": 97
}
},
{
"ph": "s", "id": 97, "pid": 2537909, "tid": 2537909, "ts": 6157602136895.056,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "nvjet_tst_208x192_64x4_2x1_v_bz_coopB_TNN", "pid": 0, "tid": 7,
"ts": 6157602136917.937, "dur": 4324.740,
"args": {
"External id": 33, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 98, "registers per thread": 168, "shared memory": 221340, "blocks per SM": 1.000000, "warps per SM": 12.000000, "grid": [2, 66, 1], "block": [384, 1, 1], "est. achieved occupancy %": 0
}
},
{
"ph": "f", "id": 98, "pid": 0, "tid": 7, "ts": 6157602136917.937,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernelEx", "pid": 2537909, "tid": 2537909,
"ts": 6157602136909.197, "dur": 7.251,
"args": {
"External id": 33, "cbid": 652, "correlation": 98
}
},
{
"ph": "s", "id": 98, "pid": 2537909, "tid": 2537909, "ts": 6157602136909.197,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_red_fused__log_softmax__log_softmax_backward_data_nll_loss_backward_nll_loss_forward_7", "pid": 0, "tid": 7,
"ts": 6157602141243.893, "dur": 5738.277,
"args": {
"External id": 35, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 119, "registers per thread": 48, "shared memory": 32, "blocks per SM": 248.242432, "warps per SM": 1985.939453, "grid": [32768, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 63
}
},
{
"ph": "f", "id": 119, "pid": 0, "tid": 7, "ts": 6157602141243.893,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909,
"ts": 6157602136968.727, "dur": 7.501,
"args": {
"External id": 35, "cbid": 307, "correlation": 119
}
},
{
"ph": "s", "id": 119, "pid": 2537909, "tid": 2537909, "ts": 6157602136968.727,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused__log_softmax_backward_data_mm_nll_loss_forward_8", "pid": 0, "tid": 7,
"ts": 6157602146984.218, "dur": 2.720,
"args": {
"External id": 36, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 122, "registers per thread": 16, "shared memory": 0, "blocks per SM": 3.393939, "warps per SM": 27.151516, "grid": [448, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 42
}
},
{
"ph": "f", "id": 122, "pid": 0, "tid": 7, "ts": 6157602146984.218,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909,
"ts": 6157602136997.150, "dur": 5.207,
"args": {
"External id": 36, "cbid": 307, "correlation": 122
}
},
{
"ph": "s", "id": 122, "pid": 2537909, "tid": 2537909, "ts": 6157602136997.150,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7,
"ts": 6157602146988.314, "dur": 0.800,
"args": {
"External id": 37, "device": 0, "context": 1, "stream": 7, "correlation": 137, "bytes": 4, "memory bandwidth (GB/s)": 0.005
}
},
{
"ph": "f", "id": 137, "pid": 0, "tid": 7, "ts": 6157602146988.314,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 2537909, "tid": 2537909,
"ts": 6157602137029.298, "dur": 3.656,
"args": {
"External id": 37, "cbid": 51, "correlation": 137
}
},
{
"ph": "s", "id": 137, "pid": 2537909, "tid": 2537909, "ts": 6157602137029.298,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "nvjet_tst_256x128_64x4_1x2_h_bz_coopA_NNT", "pid": 0, "tid": 7,
"ts": 6157602146991.066, "dur": 3787.396,
"args": {
"External id": 37, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 138, "registers per thread": 168, "shared memory": 213148, "blocks per SM": 1.000000, "warps per SM": 12.000000, "grid": [2, 66, 1], "block": [384, 1, 1], "est. achieved occupancy %": 0
}
},
{
"ph": "f", "id": 138, "pid": 0, "tid": 7, "ts": 6157602146991.066,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernelEx", "pid": 2537909, "tid": 2537909,
"ts": 6157602137033.124, "dur": 5.728,
"args": {
"External id": 37, "cbid": 652, "correlation": 138
}
},
{
"ph": "s", "id": 138, "pid": 2537909, "tid": 2537909, "ts": 6157602137033.124,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_red_fused_nll_loss_forward_sum_9", "pid": 0, "tid": 7,
"ts": 6157602150779.454, "dur": 1664.450,
"args": {
"External id": 38, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 149, "registers per thread": 40, "shared memory": 4096, "blocks per SM": 5.954545, "warps per SM": 95.272728, "grid": [786, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 75
}
},
{
"ph": "f", "id": 149, "pid": 0, "tid": 7, "ts": 6157602150779.454,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909,
"ts": 6157602137068.327, "dur": 5.538,
"args": {
"External id": 38, "cbid": 307, "correlation": 149
}
},
{
"ph": "s", "id": 149, "pid": 2537909, "tid": 2537909, "ts": 6157602137068.327,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7,
"ts": 6157602152445.120, "dur": 74.944,
"args": {
"External id": 39, "device": 0, "context": 1, "stream": 7, "correlation": 156, "bytes": 77194752, "memory bandwidth (GB/s)": 1030.0324508966696
}
},
{
"ph": "f", "id": 156, "pid": 0, "tid": 7, "ts": 6157602152445.120,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 2537909, "tid": 2537909,
"ts": 6157602137108.678, "dur": 25.148,
"args": {
"External id": 39, "cbid": 41, "correlation": 156
}
},
{
"ph": "s", "id": 156, "pid": 2537909, "tid": 2537909, "ts": 6157602137108.678,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 2537909, "tid": 2537909,
"ts": 6157602137161.007, "dur": 2.614,
"args": {
"External id": 39, "cbid": 200, "correlation": 167
}
},
{
"ph": "f", "id": 167, "pid": 2537909, "tid": 2537909, "ts": 6157602137161.007,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2<cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_256x128_32x6_nt_align8>(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_256x128_32x6_nt_align8::Params)", "pid": 0, "tid": 7,
"ts": 6157602152521.120, "dur": 5831.621,
"args": {
"External id": 39, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 170, "registers per thread": 216, "shared memory": 147456, "blocks per SM": 11.909091, "warps per SM": 95.272728, "grid": [1572, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0
}
},
{
"ph": "f", "id": 170, "pid": 0, "tid": 7, "ts": 6157602152521.120,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909,
"ts": 6157602137182.489, "dur": 5.418,
"args": {
"External id": 39, "cbid": 307, "correlation": 170
}
},
{
"ph": "s", "id": 170, "pid": 2537909, "tid": 2537909, "ts": 6157602137182.489,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_red_fused_nll_loss_forward_10", "pid": 0, "tid": 7,
"ts": 6157602158353.957, "dur": 2.720,
"args": {
"External id": 40, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 182, "registers per thread": 26, "shared memory": 64, "blocks per SM": 0.030303, "warps per SM": 0.484848, "grid": [4, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 1
}
},
{
"ph": "f", "id": 182, "pid": 0, "tid": 7, "ts": 6157602158353.957,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909,
"ts": 6157602137229.139, "dur": 5.158,
"args": {
"External id": 40, "cbid": 307, "correlation": 182
}
},
{
"ph": "s", "id": 182, "pid": 2537909, "tid": 2537909, "ts": 6157602137229.139,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_per_fused_nll_loss_forward_11", "pid": 0, "tid": 7,
"ts": 6157602158358.661, "dur": 1.888,
"args": {
"External id": 41, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 187, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.015152, "grid": [1, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0
}
},
{
"ph": "f", "id": 187, "pid": 0, "tid": 7, "ts": 6157602158358.661,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909,
"ts": 6157602137256.070, "dur": 4.136,
"args": {
"External id": 41, "cbid": 307, "correlation": 187
}
},
{
"ph": "s", "id": 187, "pid": 2537909, "tid": 2537909, "ts": 6157602137256.070,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<8, at::native::FillFunctor<c10::BFloat16>, std::array<char*, 1ul> >(int, at::native::FillFunctor<c10::BFloat16>, std::array<char*, 1ul>)", "pid": 0, "tid": 7,
"ts": 6157602158361.765, "dur": 1.376,
"args": {
"External id": 45, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 198, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.030303, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0
}
},
{
"ph": "f", "id": 198, "pid": 0, "tid": 7, "ts": 6157602158361.765,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 2537909, "tid": 2537909,
"ts": 6157602137433.678, "dur": 9.915,
"args": {
"External id": 45, "cbid": 211, "correlation": 198
}
},
{
"ph": "s", "id": 198, "pid": 2537909, "tid": 2537909, "ts": 6157602137433.678,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 2537909, "tid": 2537909,
"ts": 6157602137482.932, "dur": 2.784,
"args": {
"External id": 3, "cbid": 135, "correlation": 206
}
},
{
"ph": "f", "id": 206, "pid": 2537909, "tid": 2537909, "ts": 6157602137482.932,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_0", "pid": 0, "tid": 7,
"ts": 6157602158364.325, "dur": 60.896,
"args": {
"External id": 515, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 215, "registers per thread": 16, "shared memory": 0, "blocks per SM": 285.553040, "warps per SM": 1142.212158, "grid": [37693, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100
}
},
{
"ph": "f", "id": 215, "pid": 0, "tid": 7, "ts": 6157602158364.325,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2544200,
"ts": 6157602138140.284, "dur": 43.816,
"args": {
"External id": 515, "cbid": 307, "correlation": 215
}
},
{
"ph": "s", "id": 215, "pid": 2537909, "tid": 2544200, "ts": 6157602138140.284,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_1", "pid": 0, "tid": 7,
"ts": 6157602158426.437, "dur": 2.304,
"args": {
"External id": 516, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 219, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.750000, "warps per SM": 3.000000, "grid": [99, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 5
}
},
{
"ph": "f", "id": 219, "pid": 0, "tid": 7, "ts": 6157602158426.437,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2544200,
"ts": 6157602138210.099, "dur": 6.019,
"args": {
"External id": 516, "cbid": 307, "correlation": 219
}
},
{
"ph": "s", "id": 219, "pid": 2537909, "tid": 2544200, "ts": 6157602138210.099,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_mul_2", "pid": 0, "tid": 7,
"ts": 6157602158430.789, "dur": 47.840,
"args": {
"External id": 517, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 223, "registers per thread": 22, "shared memory": 0, "blocks per SM": 186.181824, "warps per SM": 744.727295, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100
}
},
{
"ph": "f", "id": 223, "pid": 0, "tid": 7, "ts": 6157602158430.789,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2544200,
"ts": 6157602138254.396, "dur": 5.518,
"args": {
"External id": 517, "cbid": 307, "correlation": 223
}
},
{
"ph": "s", "id": 223, "pid": 2537909, "tid": 2544200, "ts": 6157602138254.396,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 2537909, "tid": 2544200,
"ts": 6157602138323.530, "dur": 3.035,
"args": {
"External id": 513, "cbid": 135, "correlation": 228
}
},
{
"ph": "f", "id": 228, "pid": 2537909, "tid": 2544200, "ts": 6157602138323.530,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 2537909, "tid": 2544200,
"ts": 6157602138332.423, "dur": 0.601,
"args": {
"External id": 513, "cbid": 135, "correlation": 233
}
},
{
"ph": "f", "id": 233, "pid": 2537909, "tid": 2544200, "ts": 6157602138332.423,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 2537909, "tid": 2544200,
"ts": 6157602138336.019, "dur": 0.360,
"args": {
"External id": 513, "cbid": 135, "correlation": 238
}
},
{
"ph": "f", "id": 238, "pid": 2537909, "tid": 2544200, "ts": 6157602138336.019,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "kernel", "name": "triton_red_fused_nll_loss_backward_nll_loss_forward_0", "pid": 0, "tid": 7,
"ts": 6157602158480.709, "dur": 7.648,
"args": {
"External id": 51, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 299, "registers per thread": 32, "shared memory": 16384, "blocks per SM": 0.030303, "warps per SM": 0.484848, "grid": [4, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 1
}
},
{
"ph": "f", "id": 299, "pid": 0, "tid": 7, "ts": 6157602158480.709,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909,
"ts": 6157602138696.823, "dur": 12.559,
"args": {
"External id": 51, "cbid": 307, "correlation": 299
}
},
{
"ph": "s", "id": 299, "pid": 2537909, "tid": 2537909, "ts": 6157602138696.823,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_per_fused_nll_loss_forward_1", "pid": 0, "tid": 7,
"ts": 6157602158489.541, "dur": 1.440,
"args": {
"External id": 52, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 307, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.015152, "grid": [1, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0
}
},
{
"ph": "f", "id": 307, "pid": 0, "tid": 7, "ts": 6157602158489.541,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909,
"ts": 6157602138725.586, "dur": 4.107,
"args": {
"External id": 52, "cbid": 307, "correlation": 307
}
},
{
"ph": "s", "id": 307, "pid": 2537909, "tid": 2537909, "ts": 6157602138725.586,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_mul_2", "pid": 0, "tid": 7,
"ts": 6157602158491.909, "dur": 48.480,
"args": {
"External id": 53, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 315, "registers per thread": 16, "shared memory": 0, "blocks per SM": 186.181824, "warps per SM": 744.727295, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100
}
},
{
"ph": "f", "id": 315, "pid": 0, "tid": 7, "ts": 6157602158491.909,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909,
"ts": 6157602138743.313, "dur": 3.385,
"args": {
"External id": 53, "cbid": 307, "correlation": 315
}
},
{
"ph": "s", "id": 315, "pid": 2537909, "tid": 2537909, "ts": 6157602138743.313,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_3", "pid": 0, "tid": 7,
"ts": 6157602158542.470, "dur": 1.759,
"args": {
"External id": 74, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 322, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.007576, "grid": [1, 1, 1], "block": [32, 1, 1], "est. achieved occupancy %": 0
}
},
{
"ph": "f", "id": 322, "pid": 0, "tid": 7, "ts": 6157602158542.470,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909,
"ts": 6157602138804.325, "dur": 4.998,
"args": {
"External id": 74, "cbid": 307, "correlation": 322
}
},
{
"ph": "s", "id": 322, "pid": 2537909, "tid": 2537909, "ts": 6157602138804.325,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_4", "pid": 0, "tid": 7,
"ts": 6157602158545.189, "dur": 32.160,
"args": {
"External id": 75, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 329, "registers per thread": 16, "shared memory": 0, "blocks per SM": 285.553040, "warps per SM": 1142.212158, "grid": [37693, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100
}
},
{
"ph": "f", "id": 329, "pid": 0, "tid": 7, "ts": 6157602158545.189,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909,
"ts": 6157602138820.650, "dur": 2.974,
"args": {
"External id": 75, "cbid": 307, "correlation": 329
}
},
{
"ph": "s", "id": 329, "pid": 2537909, "tid": 2537909, "ts": 6157602138820.650,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_5", "pid": 0, "tid": 7,
"ts": 6157602158578.277, "dur": 1.568,
"args": {
"External id": 76, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 336, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.750000, "warps per SM": 3.000000, "grid": [99, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 5
}
},
{
"ph": "f", "id": 336, "pid": 0, "tid": 7, "ts": 6157602158578.277,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909,
"ts": 6157602138834.240, "dur": 2.664,
"args": {
"External id": 76, "cbid": 307, "correlation": 336
}
},
{
"ph": "s", "id": 336, "pid": 2537909, "tid": 2537909, "ts": 6157602138834.240,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_addmm_mm_nll_loss_forward_6", "pid": 0, "tid": 7,
"ts": 6157602158580.805, "dur": 108.577,
"args": {
"External id": 77, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 351, "registers per thread": 16, "shared memory": 0, "blocks per SM": 285.590912, "warps per SM": 1142.363647, "grid": [37698, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100
}
},
{
"ph": "f", "id": 351, "pid": 0, "tid": 7, "ts": 6157602158580.805,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909,
"ts": 6157602138867.721, "dur": 4.046,
"args": {
"External id": 77, "cbid": 307, "correlation": 351
}
},
{
"ph": "s", "id": 351, "pid": 2537909, "tid": 2537909, "ts": 6157602138867.721,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7,
"ts": 6157602158691.014, "dur": 0.959,
"args": {
"External id": 78, "device": 0, "context": 1, "stream": 7, "correlation": 366, "bytes": 4, "memory bandwidth (GB/s)": 0.004171011470281543
}
},
{
"ph": "f", "id": 366, "pid": 0, "tid": 7, "ts": 6157602158691.014,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 2537909, "tid": 2537909,
"ts": 6157602138903.955, "dur": 5.008,
"args": {
"External id": 78, "cbid": 51, "correlation": 366
}
},
{
"ph": "s", "id": 366, "pid": 2537909, "tid": 2537909, "ts": 6157602138903.955,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "nvjet_tst_208x192_64x4_2x1_v_bz_coopB_TNN", "pid": 0, "tid": 7,
"ts": 6157602158694.533, "dur": 4341.189,
"args": {
"External id": 78, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 367, "registers per thread": 168, "shared memory": 221340, "blocks per SM": 1.000000, "warps per SM": 12.000000, "grid": [2, 66, 1], "block": [384, 1, 1], "est. achieved occupancy %": 0
}
},
{
"ph": "f", "id": 367, "pid": 0, "tid": 7, "ts": 6157602158694.533,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernelEx", "pid": 2537909, "tid": 2537909,
"ts": 6157602138909.173, "dur": 5.038,
"args": {
"External id": 78, "cbid": 652, "correlation": 367
}
},
{
"ph": "s", "id": 367, "pid": 2537909, "tid": 2537909, "ts": 6157602138909.173,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_red_fused__log_softmax__log_softmax_backward_data_nll_loss_backward_nll_loss_forward_7", "pid": 0, "tid": 7,
"ts": 6157602163037.834, "dur": 5767.205,
"args": {
"External id": 80, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 388, "registers per thread": 48, "shared memory": 32, "blocks per SM": 248.242432, "warps per SM": 1985.939453, "grid": [32768, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 63
}
},
{
"ph": "f", "id": 388, "pid": 0, "tid": 7, "ts": 6157602163037.834,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909,
"ts": 6157602138947.511, "dur": 4.487,
"args": {
"External id": 80, "cbid": 307, "correlation": 388
}
},
{
"ph": "s", "id": 388, "pid": 2537909, "tid": 2537909, "ts": 6157602138947.511,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused__log_softmax_backward_data_mm_nll_loss_forward_8", "pid": 0, "tid": 7,
"ts": 6157602168806.031, "dur": 2.720,
"args": {
"External id": 81, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 391, "registers per thread": 16, "shared memory": 0, "blocks per SM": 3.393939, "warps per SM": 27.151516, "grid": [448, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 42
}
},
{
"ph": "f", "id": 391, "pid": 0, "tid": 7, "ts": 6157602168806.031,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909,
"ts": 6157602138965.158, "dur": 2.994,
"args": {
"External id": 81, "cbid": 307, "correlation": 391
}
},
{
"ph": "s", "id": 391, "pid": 2537909, "tid": 2537909, "ts": 6157602138965.158,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7,
"ts": 6157602168810.543, "dur": 0.992,
"args": {
"External id": 82, "device": 0, "context": 1, "stream": 7, "correlation": 406, "bytes": 4, "memory bandwidth (GB/s)": 0.004032258064516129
}
},
{
"ph": "f", "id": 406, "pid": 0, "tid": 7, "ts": 6157602168810.543,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 2537909, "tid": 2537909,
"ts": 6157602138987.571, "dur": 3.185,
"args": {
"External id": 82, "cbid": 51, "correlation": 406
}
},
{
"ph": "s", "id": 406, "pid": 2537909, "tid": 2537909, "ts": 6157602138987.571,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "nvjet_tst_256x128_64x4_1x2_h_bz_coopA_NNT", "pid": 0, "tid": 7,
"ts": 6157602168813.007, "dur": 3761.860,
"args": {
"External id": 82, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 407, "registers per thread": 168, "shared memory": 213148, "blocks per SM": 1.000000, "warps per SM": 12.000000, "grid": [2, 66, 1], "block": [384, 1, 1], "est. achieved occupancy %": 0
}
},
{
"ph": "f", "id": 407, "pid": 0, "tid": 7, "ts": 6157602168813.007,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernelEx", "pid": 2537909, "tid": 2537909,
"ts": 6157602138990.927, "dur": 3.996,
"args": {
"External id": 82, "cbid": 652, "correlation": 407
}
},
{
"ph": "s", "id": 407, "pid": 2537909, "tid": 2537909, "ts": 6157602138990.927,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_red_fused_nll_loss_forward_sum_9", "pid": 0, "tid": 7,
"ts": 6157602172576.915, "dur": 1672.481,
"args": {
"External id": 83, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 418, "registers per thread": 40, "shared memory": 4096, "blocks per SM": 5.954545, "warps per SM": 95.272728, "grid": [786, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 75
}
},
{
"ph": "f", "id": 418, "pid": 0, "tid": 7, "ts": 6157602172576.915,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909,
"ts": 6157602139013.420, "dur": 3.265,
"args": {
"External id": 83, "cbid": 307, "correlation": 418
}
},
{
"ph": "s", "id": 418, "pid": 2537909, "tid": 2537909, "ts": 6157602139013.420,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7,
"ts": 6157602174251.316, "dur": 75.648,
"args": {
"External id": 84, "device": 0, "context": 1, "stream": 7, "correlation": 425, "bytes": 77194752, "memory bandwidth (GB/s)": 1020.4467005076142
}
},
{
"ph": "f", "id": 425, "pid": 0, "tid": 7, "ts": 6157602174251.316,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 2537909, "tid": 2537909,
"ts": 6157602139031.107, "dur": 11.618,
"args": {
"External id": 84, "cbid": 41, "correlation": 425
}
},
{
"ph": "s", "id": 425, "pid": 2537909, "tid": 2537909, "ts": 6157602139031.107,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 2537909, "tid": 2537909,
"ts": 6157602139052.840, "dur": 0.731,
"args": {
"External id": 84, "cbid": 200, "correlation": 436
}
},
{
"ph": "f", "id": 436, "pid": 2537909, "tid": 2537909, "ts": 6157602139052.840,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2<cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_256x128_32x6_nt_align8>(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_256x128_32x6_nt_align8::Params)", "pid": 0, "tid": 7,
"ts": 6157602174328.020, "dur": 5794.694,
"args": {
"External id": 84, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 439, "registers per thread": 216, "shared memory": 147456, "blocks per SM": 11.909091, "warps per SM": 95.272728, "grid": [1572, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0
}
},
{
"ph": "f", "id": 439, "pid": 0, "tid": 7, "ts": 6157602174328.020,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909,
"ts": 6157602139054.863, "dur": 3.255,
"args": {
"External id": 84, "cbid": 307, "correlation": 439
}
},
{
"ph": "s", "id": 439, "pid": 2537909, "tid": 2537909, "ts": 6157602139054.863,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_red_fused_nll_loss_forward_10", "pid": 0, "tid": 7,
"ts": 6157602180124.890, "dur": 2.848,
"args": {
"External id": 85, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 451, "registers per thread": 26, "shared memory": 64, "blocks per SM": 0.030303, "warps per SM": 0.484848, "grid": [4, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 1
}
},
{
"ph": "f", "id": 451, "pid": 0, "tid": 7, "ts": 6157602180124.890,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909,
"ts": 6157602139084.618, "dur": 3.685,
"args": {
"External id": 85, "cbid": 307, "correlation": 451
}
},
{
"ph": "s", "id": 451, "pid": 2537909, "tid": 2537909, "ts": 6157602139084.618,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_per_fused_nll_loss_forward_11", "pid": 0, "tid": 7,
"ts": 6157602180128.794, "dur": 1.792,
"args": {
"External id": 86, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 456, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.015152, "grid": [1, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0
}
},
{
"ph": "f", "id": 456, "pid": 0, "tid": 7, "ts": 6157602180128.794,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909,
"ts": 6157602139099.831, "dur": 2.854,
"args": {
"External id": 86, "cbid": 307, "correlation": 456
}
},
{
"ph": "s", "id": 456, "pid": 2537909, "tid": 2537909, "ts": 6157602139099.831,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<8, at::native::FillFunctor<c10::BFloat16>, std::array<char*, 1ul> >(int, at::native::FillFunctor<c10::BFloat16>, std::array<char*, 1ul>)", "pid": 0, "tid": 7,
"ts": 6157602180131.578, "dur": 1.504,
"args": {
"External id": 90, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 467, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.030303, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0
}
},
{
"ph": "f", "id": 467, "pid": 0, "tid": 7, "ts": 6157602180131.578,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 2537909, "tid": 2537909,
"ts": 6157602139173.402, "dur": 5.869,
"args": {
"External id": 90, "cbid": 211, "correlation": 467
}
},
{
"ph": "s", "id": 467, "pid": 2537909, "tid": 2537909, "ts": 6157602139173.402,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 2537909, "tid": 2537909,
"ts": 6157602139196.306, "dur": 1.112,
"args": {
"External id": 48, "cbid": 135, "correlation": 475
}
},
{
"ph": "f", "id": 475, "pid": 2537909, "tid": 2537909, "ts": 6157602139196.306,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_0", "pid": 0, "tid": 7,
"ts": 6157602180134.266, "dur": 60.928,
"args": {
"External id": 532, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 484, "registers per thread": 16, "shared memory": 0, "blocks per SM": 285.553040, "warps per SM": 1142.212158, "grid": [37693, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100
}
},
{
"ph": "f", "id": 484, "pid": 0, "tid": 7, "ts": 6157602180134.266,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2544200,
"ts": 6157602139332.101, "dur": 9.865,
"args": {
"External id": 532, "cbid": 307, "correlation": 484
}
},
{
"ph": "s", "id": 484, "pid": 2537909, "tid": 2544200, "ts": 6157602139332.101,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_1", "pid": 0, "tid": 7,
"ts": 6157602180197.210, "dur": 2.432,
"args": {
"External id": 533, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 488, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.750000, "warps per SM": 3.000000, "grid": [99, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 5
}
},
{
"ph": "f", "id": 488, "pid": 0, "tid": 7, "ts": 6157602180197.210,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2544200,
"ts": 6157602139354.755, "dur": 3.496,
"args": {
"External id": 533, "cbid": 307, "correlation": 488
}
},
{
"ph": "s", "id": 488, "pid": 2537909, "tid": 2544200, "ts": 6157602139354.755,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_mul_2", "pid": 0, "tid": 7,
"ts": 6157602180200.858, "dur": 47.776,
"args": {
"External id": 534, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 492, "registers per thread": 22, "shared memory": 0, "blocks per SM": 186.181824, "warps per SM": 744.727295, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100
}
},
{
"ph": "f", "id": 492, "pid": 0, "tid": 7, "ts": 6157602180200.858,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2544200,
"ts": 6157602139372.262, "dur": 3.685,
"args": {
"External id": 534, "cbid": 307, "correlation": 492
}
},
{
"ph": "s", "id": 492, "pid": 2537909, "tid": 2544200, "ts": 6157602139372.262,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 2537909, "tid": 2544200,
"ts": 6157602139395.547, "dur": 1.011,
"args": {
"External id": 530, "cbid": 135, "correlation": 497
}
},
{
"ph": "f", "id": 497, "pid": 2537909, "tid": 2544200, "ts": 6157602139395.547,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 2537909, "tid": 2544200,
"ts": 6157602139399.743, "dur": 0.391,
"args": {
"External id": 530, "cbid": 135, "correlation": 502
}
},
{
"ph": "f", "id": 502, "pid": 2537909, "tid": 2544200, "ts": 6157602139399.743,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 2537909, "tid": 2544200,
"ts": 6157602139402.627, "dur": 0.341,
"args": {
"External id": 530, "cbid": 135, "correlation": 507
}
},
{
"ph": "f", "id": 507, "pid": 2537909, "tid": 2544200, "ts": 6157602139402.627,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "kernel", "name": "triton_red_fused_nll_loss_backward_nll_loss_forward_0", "pid": 0, "tid": 7,
"ts": 6157602180249.850, "dur": 8.512,
"args": {
"External id": 96, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 568, "registers per thread": 32, "shared memory": 16384, "blocks per SM": 0.030303, "warps per SM": 0.484848, "grid": [4, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 1
}
},
{
"ph": "f", "id": 568, "pid": 0, "tid": 7, "ts": 6157602180249.850,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909,
"ts": 6157602139646.766, "dur": 11.156,
"args": {
"External id": 96, "cbid": 307, "correlation": 568
}
},
{
"ph": "s", "id": 568, "pid": 2537909, "tid": 2537909, "ts": 6157602139646.766,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_per_fused_nll_loss_forward_1", "pid": 0, "tid": 7,
"ts": 6157602180259.450, "dur": 1.920,
"args": {
"External id": 97, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 576, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.015152, "grid": [1, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0
}
},
{
"ph": "f", "id": 576, "pid": 0, "tid": 7, "ts": 6157602180259.450,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909,
"ts": 6157602139671.313, "dur": 4.977,
"args": {
"External id": 97, "cbid": 307, "correlation": 576
}
},
{
"ph": "s", "id": 576, "pid": 2537909, "tid": 2537909, "ts": 6157602139671.313,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_mul_2", "pid": 0, "tid": 7,
"ts": 6157602180263.514, "dur": 48.320,
"args": {
"External id": 98, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 584, "registers per thread": 16, "shared memory": 0, "blocks per SM": 186.181824, "warps per SM": 744.727295, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100
}
},
{
"ph": "f", "id": 584, "pid": 0, "tid": 7, "ts": 6157602180263.514,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909,
"ts": 6157602139689.110, "dur": 3.345,
"args": {
"External id": 98, "cbid": 307, "correlation": 584
}
},
{
"ph": "s", "id": 584, "pid": 2537909, "tid": 2537909, "ts": 6157602139689.110,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_3", "pid": 0, "tid": 7,
"ts": 6157602180314.010, "dur": 1.632,
"args": {
"External id": 119, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 591, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.007576, "grid": [1, 1, 1], "block": [32, 1, 1], "est. achieved occupancy %": 0
}
},
{
"ph": "f", "id": 591, "pid": 0, "tid": 7, "ts": 6157602180314.010,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909,
"ts": 6157602139748.719, "dur": 4.407,
"args": {
"External id": 119, "cbid": 307, "correlation": 591
}
},
{
"ph": "s", "id": 591, "pid": 2537909, "tid": 2537909, "ts": 6157602139748.719,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_4", "pid": 0, "tid": 7,
"ts": 6157602180316.634, "dur": 31.776,
"args": {
"External id": 120, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 598, "registers per thread": 16, "shared memory": 0, "blocks per SM": 285.553040, "warps per SM": 1142.212158, "grid": [37693, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100
}
},
{
"ph": "f", "id": 598, "pid": 0, "tid": 7, "ts": 6157602180316.634,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909,
"ts": 6157602139763.712, "dur": 3.425,
"args": {
"External id": 120, "cbid": 307, "correlation": 598
}
},
{
"ph": "s", "id": 598, "pid": 2537909, "tid": 2537909, "ts": 6157602139763.712,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_5", "pid": 0, "tid": 7,
"ts": 6157602180349.658, "dur": 1.792,
"args": {
"External id": 121, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 605, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.750000, "warps per SM": 3.000000, "grid": [99, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 5
}
},
{
"ph": "f", "id": 605, "pid": 0, "tid": 7, "ts": 6157602180349.658,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909,
"ts": 6157602139777.974, "dur": 2.954,
"args": {
"External id": 121, "cbid": 307, "correlation": 605
}
},
{
"ph": "s", "id": 605, "pid": 2537909, "tid": 2537909, "ts": 6157602139777.974,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_addmm_mm_nll_loss_forward_6", "pid": 0, "tid": 7,
"ts": 6157602180352.474, "dur": 108.384,
"args": {
"External id": 122, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 620, "registers per thread": 16, "shared memory": 0, "blocks per SM": 285.590912, "warps per SM": 1142.363647, "grid": [37698, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100
}
},
{
"ph": "f", "id": 620, "pid": 0, "tid": 7, "ts": 6157602180352.474,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909,
"ts": 6157602139810.583, "dur": 3.345,
"args": {
"External id": 122, "cbid": 307, "correlation": 620
}
},
{
"ph": "s", "id": 620, "pid": 2537909, "tid": 2537909, "ts": 6157602139810.583,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7,
"ts": 6157602180462.362, "dur": 1.184,
"args": {
"External id": 123, "device": 0, "context": 1, "stream": 7, "correlation": 635, "bytes": 4, "memory bandwidth (GB/s)": 0.0033783783783783786
}
},
{
"ph": "f", "id": 635, "pid": 0, "tid": 7, "ts": 6157602180462.362,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 2537909, "tid": 2537909,
"ts": 6157602139841.339, "dur": 4.547,
"args": {
"External id": 123, "cbid": 51, "correlation": 635
}
},
{
"ph": "s", "id": 635, "pid": 2537909, "tid": 2537909, "ts": 6157602139841.339,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "nvjet_tst_208x192_64x4_2x1_v_bz_coopB_TNN", "pid": 0, "tid": 7,
"ts": 6157602180465.402, "dur": 4310.468,
"args": {
"External id": 123, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 636, "registers per thread": 168, "shared memory": 221340, "blocks per SM": 1.000000, "warps per SM": 12.000000, "grid": [2, 66, 1], "block": [384, 1, 1], "est. achieved occupancy %": 0
}
},
{
"ph": "f", "id": 636, "pid": 0, "tid": 7, "ts": 6157602180465.402,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernelEx", "pid": 2537909, "tid": 2537909,
"ts": 6157602139846.126, "dur": 4.868,
"args": {
"External id": 123, "cbid": 652, "correlation": 636
}
},
{
"ph": "s", "id": 636, "pid": 2537909, "tid": 2537909, "ts": 6157602139846.126,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_red_fused__log_softmax__log_softmax_backward_data_nll_loss_backward_nll_loss_forward_7", "pid": 0, "tid": 7,
"ts": 6157602184776.926, "dur": 7076.519,
"args": {
"External id": 125, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 657, "registers per thread": 48, "shared memory": 32, "blocks per SM": 248.242432, "warps per SM": 1985.939453, "grid": [32768, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 63
}
},
{
"ph": "f", "id": 657, "pid": 0, "tid": 7, "ts": 6157602184776.926,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909,
"ts": 6157602139883.112, "dur": 4.267,
"args": {
"External id": 125, "cbid": 307, "correlation": 657
}
},
{
"ph": "s", "id": 657, "pid": 2537909, "tid": 2537909, "ts": 6157602139883.112,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused__log_softmax_backward_data_mm_nll_loss_forward_8", "pid": 0, "tid": 7,
"ts": 6157602191854.853, "dur": 2.752,
"args": {
"External id": 126, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 660, "registers per thread": 16, "shared memory": 0, "blocks per SM": 3.393939, "warps per SM": 27.151516, "grid": [448, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 42
}
},
{
"ph": "f", "id": 660, "pid": 0, "tid": 7, "ts": 6157602191854.853,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909,
"ts": 6157602139899.978, "dur": 3.234,
"args": {
"External id": 126, "cbid": 307, "correlation": 660
}
},
{
"ph": "s", "id": 660, "pid": 2537909, "tid": 2537909, "ts": 6157602139899.978,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7,
"ts": 6157602191859.333, "dur": 0.992,
"args": {
"External id": 127, "device": 0, "context": 1, "stream": 7, "correlation": 675, "bytes": 4, "memory bandwidth (GB/s)": 0.004032258064516129
}
},
{
"ph": "f", "id": 675, "pid": 0, "tid": 7, "ts": 6157602191859.333,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 2537909, "tid": 2537909,
"ts": 6157602139921.139, "dur": 3.195,
"args": {
"External id": 127, "cbid": 51, "correlation": 675
}
},
{
"ph": "s", "id": 675, "pid": 2537909, "tid": 2537909, "ts": 6157602139921.139,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "nvjet_tst_256x128_64x4_1x2_h_bz_coopA_NNT", "pid": 0, "tid": 7,
"ts": 6157602191863.045, "dur": 3698.628,
"args": {
"External id": 127, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 676, "registers per thread": 168, "shared memory": 213148, "blocks per SM": 1.000000, "warps per SM": 12.000000, "grid": [2, 66, 1], "block": [384, 1, 1], "est. achieved occupancy %": 0
}
},
{
"ph": "f", "id": 676, "pid": 0, "tid": 7, "ts": 6157602191863.045,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernelEx", "pid": 2537909, "tid": 2537909,
"ts": 6157602139924.485, "dur": 3.805,
"args": {
"External id": 127, "cbid": 652, "correlation": 676
}
},
{
"ph": "s", "id": 676, "pid": 2537909, "tid": 2537909, "ts": 6157602139924.485,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_red_fused_nll_loss_forward_sum_9", "pid": 0, "tid": 7,
"ts": 6157602195563.816, "dur": 1675.202,
"args": {
"External id": 128, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 687, "registers per thread": 40, "shared memory": 4096, "blocks per SM": 5.954545, "warps per SM": 95.272728, "grid": [786, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 75
}
},
{
"ph": "f", "id": 687, "pid": 0, "tid": 7, "ts": 6157602195563.816,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909,
"ts": 6157602139946.428, "dur": 3.775,
"args": {
"External id": 128, "cbid": 307, "correlation": 687
}
},
{
"ph": "s", "id": 687, "pid": 2537909, "tid": 2537909, "ts": 6157602139946.428,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7,
"ts": 6157602197240.074, "dur": 75.712,
"args": {
"External id": 129, "device": 0, "context": 1, "stream": 7, "correlation": 694, "bytes": 77194752, "memory bandwidth (GB/s)": 1019.5841081994928
}
},
{
"ph": "f", "id": 694, "pid": 0, "tid": 7, "ts": 6157602197240.074,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 2537909, "tid": 2537909,
"ts": 6157602139963.604, "dur": 10.826,
"args": {
"External id": 129, "cbid": 41, "correlation": 694
}
},
{
"ph": "s", "id": 694, "pid": 2537909, "tid": 2537909, "ts": 6157602139963.604,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 2537909, "tid": 2537909,
"ts": 6157602139984.475, "dur": 0.491,
"args": {
"External id": 129, "cbid": 200, "correlation": 705
}
},
{
"ph": "f", "id": 705, "pid": 2537909, "tid": 2537909, "ts": 6157602139984.475,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2<cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_256x128_32x6_nt_align8>(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_256x128_32x6_nt_align8::Params)", "pid": 0, "tid": 7,
"ts": 6157602197317.994, "dur": 5983.974,
"args": {
"External id": 129, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 708, "registers per thread": 216, "shared memory": 147456, "blocks per SM": 11.909091, "warps per SM": 95.272728, "grid": [1572, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0
}
},
{
"ph": "f", "id": 708, "pid": 0, "tid": 7, "ts": 6157602197317.994,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909,
"ts": 6157602139986.348, "dur": 3.155,
"args": {
"External id": 129, "cbid": 307, "correlation": 708
}
},
{
"ph": "s", "id": 708, "pid": 2537909, "tid": 2537909, "ts": 6157602139986.348,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_red_fused_nll_loss_forward_10", "pid": 0, "tid": 7,
"ts": 6157602203304.208, "dur": 2.976,
"args": {
"External id": 130, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 720, "registers per thread": 26, "shared memory": 64, "blocks per SM": 0.030303, "warps per SM": 0.484848, "grid": [4, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 1
}
},
{
"ph": "f", "id": 720, "pid": 0, "tid": 7, "ts": 6157602203304.208,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909,
"ts": 6157602140016.283, "dur": 3.345,
"args": {
"External id": 130, "cbid": 307, "correlation": 720
}
},
{
"ph": "s", "id": 720, "pid": 2537909, "tid": 2537909, "ts": 6157602140016.283,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_per_fused_nll_loss_forward_11", "pid": 0, "tid": 7,
"ts": 6157602203308.208, "dur": 1.920,
"args": {
"External id": 131, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 725, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.015152, "grid": [1, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0
}
},
{
"ph": "f", "id": 725, "pid": 0, "tid": 7, "ts": 6157602203308.208,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909,
"ts": 6157602140030.364, "dur": 2.995,
"args": {
"External id": 131, "cbid": 307, "correlation": 725
}
},
{
"ph": "s", "id": 725, "pid": 2537909, "tid": 2537909, "ts": 6157602140030.364,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<8, at::native::FillFunctor<c10::BFloat16>, std::array<char*, 1ul> >(int, at::native::FillFunctor<c10::BFloat16>, std::array<char*, 1ul>)", "pid": 0, "tid": 7,
"ts": 6157602203311.280, "dur": 1.792,
"args": {
"External id": 135, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 736, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.030303, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0
}
},
{
"ph": "f", "id": 736, "pid": 0, "tid": 7, "ts": 6157602203311.280,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 2537909, "tid": 2537909,
"ts": 6157602140099.168, "dur": 5.879,
"args": {
"External id": 135, "cbid": 211, "correlation": 736
}
},
{
"ph": "s", "id": 736, "pid": 2537909, "tid": 2537909, "ts": 6157602140099.168,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 2537909, "tid": 2537909,
"ts": 6157602140121.161, "dur": 1.132,
"args": {
"External id": 93, "cbid": 135, "correlation": 744
}
},
{
"ph": "f", "id": 744, "pid": 2537909, "tid": 2537909, "ts": 6157602140121.161,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_0", "pid": 0, "tid": 7,
"ts": 6157602203314.480, "dur": 61.280,
"args": {
"External id": 549, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 753, "registers per thread": 16, "shared memory": 0, "blocks per SM": 285.553040, "warps per SM": 1142.212158, "grid": [37693, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100
}
},
{
"ph": "f", "id": 753, "pid": 0, "tid": 7, "ts": 6157602203314.480,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2544200,
"ts": 6157602140240.000, "dur": 9.655,
"args": {
"External id": 549, "cbid": 307, "correlation": 753
}
},
{
"ph": "s", "id": 753, "pid": 2537909, "tid": 2544200, "ts": 6157602140240.000,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_1", "pid": 0, "tid": 7,
"ts": 6157602203378.032, "dur": 2.496,
"args": {
"External id": 550, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 757, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.750000, "warps per SM": 3.000000, "grid": [99, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 5
}
},
{
"ph": "f", "id": 757, "pid": 0, "tid": 7, "ts": 6157602203378.032,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2544200,
"ts": 6157602140259.970, "dur": 2.935,
"args": {
"External id": 550, "cbid": 307, "correlation": 757
}
},
{
"ph": "s", "id": 757, "pid": 2537909, "tid": 2544200, "ts": 6157602140259.970,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_mul_2", "pid": 0, "tid": 7,
"ts": 6157602203381.872, "dur": 48.128,
"args": {
"External id": 551, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 761, "registers per thread": 22, "shared memory": 0, "blocks per SM": 186.181824, "warps per SM": 744.727295, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100
}
},
{
"ph": "f", "id": 761, "pid": 0, "tid": 7, "ts": 6157602203381.872,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2544200,
"ts": 6157602140275.905, "dur": 2.503,
"args": {
"External id": 551, "cbid": 307, "correlation": 761
}
},
{
"ph": "s", "id": 761, "pid": 2537909, "tid": 2544200, "ts": 6157602140275.905,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 2537909, "tid": 2544200,
"ts": 6157602140312.069, "dur": 0.881,
"args": {
"External id": 547, "cbid": 135, "correlation": 766
}
},
{
"ph": "f", "id": 766, "pid": 2537909, "tid": 2544200, "ts": 6157602140312.069,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 2537909, "tid": 2544200,
"ts": 6157602140316.826, "dur": 0.581,
"args": {
"External id": 547, "cbid": 135, "correlation": 771
}
},
{
"ph": "f", "id": 771, "pid": 2537909, "tid": 2544200, "ts": 6157602140316.826,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 2537909, "tid": 2544200,
"ts": 6157602140319.891, "dur": 0.340,
"args": {
"External id": 547, "cbid": 135, "correlation": 776
}
},
{
"ph": "f", "id": 776, "pid": 2537909, "tid": 2544200, "ts": 6157602140319.891,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "kernel", "name": "triton_red_fused_nll_loss_backward_nll_loss_forward_0", "pid": 0, "tid": 7,
"ts": 6157602203431.088, "dur": 8.832,
"args": {
"External id": 141, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 837, "registers per thread": 32, "shared memory": 16384, "blocks per SM": 0.030303, "warps per SM": 0.484848, "grid": [4, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 1
}
},
{
"ph": "f", "id": 837, "pid": 0, "tid": 7, "ts": 6157602203431.088,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909,
"ts": 6157602140556.227, "dur": 12.018,
"args": {
"External id": 141, "cbid": 307, "correlation": 837
}
},
{
"ph": "s", "id": 837, "pid": 2537909, "tid": 2537909, "ts": 6157602140556.227,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_per_fused_nll_loss_forward_1", "pid": 0, "tid": 7,
"ts": 6157602203442.160, "dur": 1.920,
"args": {
"External id": 142, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 845, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.015152, "grid": [1, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0
}
},
{
"ph": "f", "id": 845, "pid": 0, "tid": 7, "ts": 6157602203442.160,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909,
"ts": 6157602140582.988, "dur": 4.196,
"args": {
"External id": 142, "cbid": 307, "correlation": 845
}
},
{
"ph": "s", "id": 845, "pid": 2537909, "tid": 2537909, "ts": 6157602140582.988,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_mul_2", "pid": 0, "tid": 7,
"ts": 6157602203445.168, "dur": 48.704,
"args": {
"External id": 143, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 853, "registers per thread": 16, "shared memory": 0, "blocks per SM": 186.181824, "warps per SM": 744.727295, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100
}
},
{
"ph": "f", "id": 853, "pid": 0, "tid": 7, "ts": 6157602203445.168,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909,
"ts": 6157602140599.773, "dur": 3.435,
"args": {
"External id": 143, "cbid": 307, "correlation": 853
}
},
{
"ph": "s", "id": 853, "pid": 2537909, "tid": 2537909, "ts": 6157602140599.773,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_3", "pid": 0, "tid": 7,
"ts": 6157602203494.960, "dur": 1.696,
"args": {
"External id": 164, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 860, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.007576, "grid": [1, 1, 1], "block": [32, 1, 1], "est. achieved occupancy %": 0
}
},
{
"ph": "f", "id": 860, "pid": 0, "tid": 7, "ts": 6157602203494.960,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909,
"ts": 6157602140657.470, "dur": 4.016,
"args": {
"External id": 164, "cbid": 307, "correlation": 860
}
},
{
"ph": "s", "id": 860, "pid": 2537909, "tid": 2537909, "ts": 6157602140657.470,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_4", "pid": 0, "tid": 7,
"ts": 6157602203497.808, "dur": 32.192,
"args": {
"External id": 165, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 867, "registers per thread": 16, "shared memory": 0, "blocks per SM": 285.553040, "warps per SM": 1142.212158, "grid": [37693, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100
}
},
{
"ph": "f", "id": 867, "pid": 0, "tid": 7, "ts": 6157602203497.808,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909,
"ts": 6157602140672.212, "dur": 2.604,
"args": {
"External id": 165, "cbid": 307, "correlation": 867
}
},
{
"ph": "s", "id": 867, "pid": 2537909, "tid": 2537909, "ts": 6157602140672.212,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_5", "pid": 0, "tid": 7,
"ts": 6157602203532.304, "dur": 1.792,
"args": {
"External id": 166, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 874, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.750000, "warps per SM": 3.000000, "grid": [99, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 5
}
},
{
"ph": "f", "id": 874, "pid": 0, "tid": 7, "ts": 6157602203532.304,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909,
"ts": 6157602140685.292, "dur": 3.205,
"args": {
"External id": 166, "cbid": 307, "correlation": 874
}
},
{
"ph": "s", "id": 874, "pid": 2537909, "tid": 2537909, "ts": 6157602140685.292,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_addmm_mm_nll_loss_forward_6", "pid": 0, "tid": 7,
"ts": 6157602203535.408, "dur": 108.128,
"args": {
"External id": 167, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 889, "registers per thread": 16, "shared memory": 0, "blocks per SM": 285.590912, "warps per SM": 1142.363647, "grid": [37698, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100
}
},
{
"ph": "f", "id": 889, "pid": 0, "tid": 7, "ts": 6157602203535.408,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909,
"ts": 6157602140718.222, "dur": 3.034,
"args": {
"External id": 167, "cbid": 307, "correlation": 889
}
},
{
"ph": "s", "id": 889, "pid": 2537909, "tid": 2537909, "ts": 6157602140718.222,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7,
"ts": 6157602203645.104, "dur": 0.896,
"args": {
"External id": 168, "device": 0, "context": 1, "stream": 7, "correlation": 904, "bytes": 4, "memory bandwidth (GB/s)": 0.004464285714285714
}
},
{
"ph": "f", "id": 904, "pid": 0, "tid": 7, "ts": 6157602203645.104,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 2537909, "tid": 2537909,
"ts": 6157602140750.891, "dur": 4.887,
"args": {
"External id": 168, "cbid": 51, "correlation": 904
}
},
{
"ph": "s", "id": 904, "pid": 2537909, "tid": 2537909, "ts": 6157602140750.891,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "nvjet_tst_208x192_64x4_2x1_v_bz_coopB_TNN", "pid": 0, "tid": 7,
"ts": 6157602203647.664, "dur": 4371.940,
"args": {
"External id": 168, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 905, "registers per thread": 168, "shared memory": 221340, "blocks per SM": 1.000000, "warps per SM": 12.000000, "grid": [2, 66, 1], "block": [384, 1, 1], "est. achieved occupancy %": 0
}
},
{
"ph": "f", "id": 905, "pid": 0, "tid": 7, "ts": 6157602203647.664,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernelEx", "pid": 2537909, "tid": 2537909,
"ts": 6157602140755.999, "dur": 4.717,
"args": {
"External id": 168, "cbid": 652, "correlation": 905
}
},
{
"ph": "s", "id": 905, "pid": 2537909, "tid": 2537909, "ts": 6157602140755.999,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_red_fused__log_softmax__log_softmax_backward_data_nll_loss_backward_nll_loss_forward_7", "pid": 0, "tid": 7,
"ts": 6157602208020.980, "dur": 7215.591,
"args": {
"External id": 170, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 926, "registers per thread": 48, "shared memory": 32, "blocks per SM": 248.242432, "warps per SM": 1985.939453, "grid": [32768, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 63
}
},
{
"ph": "f", "id": 926, "pid": 0, "tid": 7, "ts": 6157602208020.980,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909,
"ts": 6157602140792.363, "dur": 4.337,
"args": {
"External id": 170, "cbid": 307, "correlation": 926
}
},
{
"ph": "s", "id": 926, "pid": 2537909, "tid": 2537909, "ts": 6157602140792.363,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused__log_softmax_backward_data_mm_nll_loss_forward_8", "pid": 0, "tid": 7,
"ts": 6157602215237.947, "dur": 2.848,
"args": {
"External id": 171, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 929, "registers per thread": 16, "shared memory": 0, "blocks per SM": 3.393939, "warps per SM": 27.151516, "grid": [448, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 42
}
},
{
"ph": "f", "id": 929, "pid": 0, "tid": 7, "ts": 6157602215237.947,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909,
"ts": 6157602140807.446, "dur": 2.804,
"args": {
"External id": 171, "cbid": 307, "correlation": 929
}
},
{
"ph": "s", "id": 929, "pid": 2537909, "tid": 2537909, "ts": 6157602140807.446,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7,
"ts": 6157602215242.235, "dur": 1.184,
"args": {
"External id": 172, "device": 0, "context": 1, "stream": 7, "correlation": 944, "bytes": 4, "memory bandwidth (GB/s)": 0.0033783783783783786
}
},
{
"ph": "f", "id": 944, "pid": 0, "tid": 7, "ts": 6157602215242.235,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 2537909, "tid": 2537909,
"ts": 6157602140828.949, "dur": 3.104,
"args": {
"External id": 172, "cbid": 51, "correlation": 944
}
},
{
"ph": "s", "id": 944, "pid": 2537909, "tid": 2537909, "ts": 6157602140828.949,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "nvjet_tst_256x128_64x4_1x2_h_bz_coopA_NNT", "pid": 0, "tid": 7,
"ts": 6157602215245.531, "dur": 3707.620,
"args": {
"External id": 172, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 945, "registers per thread": 168, "shared memory": 213148, "blocks per SM": 1.000000, "warps per SM": 12.000000, "grid": [2, 66, 1], "block": [384, 1, 1], "est. achieved occupancy %": 0
}
},
{
"ph": "f", "id": 945, "pid": 0, "tid": 7, "ts": 6157602215245.531,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernelEx", "pid": 2537909, "tid": 2537909,
"ts": 6157602140832.214, "dur": 3.595,
"args": {
"External id": 172, "cbid": 652, "correlation": 945
}
},
{
"ph": "s", "id": 945, "pid": 2537909, "tid": 2537909, "ts": 6157602140832.214,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_red_fused_nll_loss_forward_sum_9", "pid": 0, "tid": 7,
"ts": 6157602218954.526, "dur": 1679.490,
"args": {
"External id": 173, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 956, "registers per thread": 40, "shared memory": 4096, "blocks per SM": 5.954545, "warps per SM": 95.272728, "grid": [786, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 75
}
},
{
"ph": "f", "id": 956, "pid": 0, "tid": 7, "ts": 6157602218954.526,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909,
"ts": 6157602140853.225, "dur": 3.015,
"args": {
"External id": 173, "cbid": 307, "correlation": 956
}
},
{
"ph": "s", "id": 956, "pid": 2537909, "tid": 2537909, "ts": 6157602140853.225,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7,
"ts": 6157602220635.200, "dur": 77.344,
"args": {
"External id": 174, "device": 0, "context": 1, "stream": 7, "correlation": 963, "bytes": 77194752, "memory bandwidth (GB/s)": 998.0703351261895
}
},
{
"ph": "f", "id": 963, "pid": 0, "tid": 7, "ts": 6157602220635.200,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 2537909, "tid": 2537909,
"ts": 6157602140869.620, "dur": 11.187,
"args": {
"External id": 174, "cbid": 41, "correlation": 963
}
},
{
"ph": "s", "id": 963, "pid": 2537909, "tid": 2537909, "ts": 6157602140869.620,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 2537909, "tid": 2537909,
"ts": 6157602140890.501, "dur": 0.461,
"args": {
"External id": 174, "cbid": 200, "correlation": 974
}
},
{
"ph": "f", "id": 974, "pid": 2537909, "tid": 2537909, "ts": 6157602140890.501,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2<cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_256x128_32x6_nt_align8>(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_256x128_32x6_nt_align8::Params)", "pid": 0, "tid": 7,
"ts": 6157602220713.632, "dur": 6128.294,
"args": {
"External id": 174, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 977, "registers per thread": 216, "shared memory": 147456, "blocks per SM": 11.909091, "warps per SM": 95.272728, "grid": [1572, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0
}
},
{
"ph": "f", "id": 977, "pid": 0, "tid": 7, "ts": 6157602220713.632,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909,
"ts": 6157602140892.184, "dur": 3.335,
"args": {
"External id": 174, "cbid": 307, "correlation": 977
}
},
{
"ph": "s", "id": 977, "pid": 2537909, "tid": 2537909, "ts": 6157602140892.184,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_red_fused_nll_loss_forward_10", "pid": 0, "tid": 7,
"ts": 6157602226843.302, "dur": 2.880,
"args": {
"External id": 175, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 989, "registers per thread": 26, "shared memory": 64, "blocks per SM": 0.030303, "warps per SM": 0.484848, "grid": [4, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 1
}
},
{
"ph": "f", "id": 989, "pid": 0, "tid": 7, "ts": 6157602226843.302,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909,
"ts": 6157602140921.148, "dur": 3.916,
"args": {
"External id": 175, "cbid": 307, "correlation": 989
}
},
{
"ph": "s", "id": 989, "pid": 2537909, "tid": 2537909, "ts": 6157602140921.148,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_per_fused_nll_loss_forward_11", "pid": 0, "tid": 7,
"ts": 6157602226847.270, "dur": 2.304,
"args": {
"External id": 176, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 994, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.015152, "grid": [1, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0
}
},
{
"ph": "f", "id": 994, "pid": 0, "tid": 7, "ts": 6157602226847.270,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909,
"ts": 6157602140936.631, "dur": 2.874,
"args": {
"External id": 176, "cbid": 307, "correlation": 994
}
},
{
"ph": "s", "id": 994, "pid": 2537909, "tid": 2537909, "ts": 6157602140936.631,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<8, at::native::FillFunctor<c10::BFloat16>, std::array<char*, 1ul> >(int, at::native::FillFunctor<c10::BFloat16>, std::array<char*, 1ul>)", "pid": 0, "tid": 7,
"ts": 6157602226851.686, "dur": 1.856,
"args": {
"External id": 180, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1005, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.030303, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0
}
},
{
"ph": "f", "id": 1005, "pid": 0, "tid": 7, "ts": 6157602226851.686,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 2537909, "tid": 2537909,
"ts": 6157602141006.136, "dur": 6.179,
"args": {
"External id": 180, "cbid": 211, "correlation": 1005
}
},
{
"ph": "s", "id": 1005, "pid": 2537909, "tid": 2537909, "ts": 6157602141006.136,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 2537909, "tid": 2537909,
"ts": 6157602141028.950, "dur": 1.112,
"args": {
"External id": 138, "cbid": 135, "correlation": 1013
}
},
{
"ph": "f", "id": 1013, "pid": 2537909, "tid": 2537909, "ts": 6157602141028.950,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_0", "pid": 0, "tid": 7,
"ts": 6157602226854.886, "dur": 61.344,
"args": {
"External id": 566, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1022, "registers per thread": 16, "shared memory": 0, "blocks per SM": 285.553040, "warps per SM": 1142.212158, "grid": [37693, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100
}
},
{
"ph": "f", "id": 1022, "pid": 0, "tid": 7, "ts": 6157602226854.886,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2544200,
"ts": 6157602141147.840, "dur": 8.873,
"args": {
"External id": 566, "cbid": 307, "correlation": 1022
}
},
{
"ph": "s", "id": 1022, "pid": 2537909, "tid": 2544200, "ts": 6157602141147.840,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_1", "pid": 0, "tid": 7,
"ts": 6157602226917.318, "dur": 2.560,
"args": {
"External id": 567, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1026, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.750000, "warps per SM": 3.000000, "grid": [99, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 5
}
},
{
"ph": "f", "id": 1026, "pid": 0, "tid": 7, "ts": 6157602226917.318,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2544200,
"ts": 6157602141167.900, "dur": 2.914,
"args": {
"External id": 567, "cbid": 307, "correlation": 1026
}
},
{
"ph": "s", "id": 1026, "pid": 2537909, "tid": 2544200, "ts": 6157602141167.900,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_mul_2", "pid": 0, "tid": 7,
"ts": 6157602226921.286, "dur": 47.936,
"args": {
"External id": 568, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1030, "registers per thread": 22, "shared memory": 0, "blocks per SM": 186.181824, "warps per SM": 744.727295, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100
}
},
{
"ph": "f", "id": 1030, "pid": 0, "tid": 7, "ts": 6157602226921.286,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2544200,
"ts": 6157602141184.275, "dur": 2.664,
"args": {
"External id": 568, "cbid": 307, "correlation": 1030
}
},
{
"ph": "s", "id": 1030, "pid": 2537909, "tid": 2544200, "ts": 6157602141184.275,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 2537909, "tid": 2544200,
"ts": 6157602141205.527, "dur": 1.121,
"args": {
"External id": 564, "cbid": 135, "correlation": 1035
}
},
{
"ph": "f", "id": 1035, "pid": 2537909, "tid": 2544200, "ts": 6157602141205.527,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 2537909, "tid": 2544200,
"ts": 6157602141208.832, "dur": 0.390,
"args": {
"External id": 564, "cbid": 135, "correlation": 1040
}
},
{
"ph": "f", "id": 1040, "pid": 2537909, "tid": 2544200, "ts": 6157602141208.832,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 2537909, "tid": 2544200,
"ts": 6157602141211.566, "dur": 0.330,
"args": {
"External id": 564, "cbid": 135, "correlation": 1045
}
},
{
"ph": "f", "id": 1045, "pid": 2537909, "tid": 2544200, "ts": 6157602141211.566,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "kernel", "name": "triton_red_fused_nll_loss_backward_nll_loss_forward_0", "pid": 0, "tid": 7,
"ts": 6157602226971.430, "dur": 9.376,
"args": {
"External id": 186, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1106, "registers per thread": 32, "shared memory": 16384, "blocks per SM": 0.030303, "warps per SM": 0.484848, "grid": [4, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 1
}
},
{
"ph": "f", "id": 1106, "pid": 0, "tid": 7, "ts": 6157602226971.430,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909,
"ts": 6157602141454.782, "dur": 11.698,
"args": {
"External id": 186, "cbid": 307, "correlation": 1106
}
},
{
"ph": "s", "id": 1106, "pid": 2537909, "tid": 2537909, "ts": 6157602141454.782,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_per_fused_nll_loss_forward_1", "pid": 0, "tid": 7,
"ts": 6157602226982.182, "dur": 1.728,
"args": {
"External id": 187, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1114, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.015152, "grid": [1, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0
}
},
{
"ph": "f", "id": 1114, "pid": 0, "tid": 7, "ts": 6157602226982.182,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909,
"ts": 6157602141481.162, "dur": 3.866,
"args": {
"External id": 187, "cbid": 307, "correlation": 1114
}
},
{
"ph": "s", "id": 1114, "pid": 2537909, "tid": 2537909, "ts": 6157602141481.162,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_mul_2", "pid": 0, "tid": 7,
"ts": 6157602226985.254, "dur": 48.608,
"args": {
"External id": 188, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1122, "registers per thread": 16, "shared memory": 0, "blocks per SM": 186.181824, "warps per SM": 744.727295, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100
}
},
{
"ph": "f", "id": 1122, "pid": 0, "tid": 7, "ts": 6157602226985.254,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909,
"ts": 6157602141498.779, "dur": 3.014,
"args": {
"External id": 188, "cbid": 307, "correlation": 1122
}
},
{
"ph": "s", "id": 1122, "pid": 2537909, "tid": 2537909, "ts": 6157602141498.779,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_3", "pid": 0, "tid": 7,
"ts": 6157602227035.206, "dur": 1.920,
"args": {
"External id": 209, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1129, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.007576, "grid": [1, 1, 1], "block": [32, 1, 1], "est. achieved occupancy %": 0
}
},
{
"ph": "f", "id": 1129, "pid": 0, "tid": 7, "ts": 6157602227035.206,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909,
"ts": 6157602141557.127, "dur": 4.747,
"args": {
"External id": 209, "cbid": 307, "correlation": 1129
}
},
{
"ph": "s", "id": 1129, "pid": 2537909, "tid": 2537909, "ts": 6157602141557.127,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_4", "pid": 0, "tid": 7,
"ts": 6157602227039.238, "dur": 32.096,
"args": {
"External id": 210, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1136, "registers per thread": 16, "shared memory": 0, "blocks per SM": 285.553040, "warps per SM": 1142.212158, "grid": [37693, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100
}
},
{
"ph": "f", "id": 1136, "pid": 0, "tid": 7, "ts": 6157602227039.238,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909,
"ts": 6157602141573.461, "dur": 3.756,
"args": {
"External id": 210, "cbid": 307, "correlation": 1136
}
},
{
"ph": "s", "id": 1136, "pid": 2537909, "tid": 2537909, "ts": 6157602141573.461,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_5", "pid": 0, "tid": 7,
"ts": 6157602227073.606, "dur": 1.760,
"args": {
"External id": 211, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1143, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.750000, "warps per SM": 3.000000, "grid": [99, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 5
}
},
{
"ph": "f", "id": 1143, "pid": 0, "tid": 7, "ts": 6157602227073.606,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909,
"ts": 6157602141586.631, "dur": 2.604,
"args": {
"External id": 211, "cbid": 307, "correlation": 1143
}
},
{
"ph": "s", "id": 1143, "pid": 2537909, "tid": 2537909, "ts": 6157602141586.631,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_addmm_mm_nll_loss_forward_6", "pid": 0, "tid": 7,
"ts": 6157602227076.390, "dur": 108.960,
"args": {
"External id": 212, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1158, "registers per thread": 16, "shared memory": 0, "blocks per SM": 285.590912, "warps per SM": 1142.363647, "grid": [37698, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100
}
},
{
"ph": "f", "id": 1158, "pid": 0, "tid": 7, "ts": 6157602227076.390,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909,
"ts": 6157602141619.090, "dur": 3.355,
"args": {
"External id": 212, "cbid": 307, "correlation": 1158
}
},
{
"ph": "s", "id": 1158, "pid": 2537909, "tid": 2537909, "ts": 6157602141619.090,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7,
"ts": 6157602227187.270, "dur": 1.120,
"args": {
"External id": 213, "device": 0, "context": 1, "stream": 7, "correlation": 1173, "bytes": 4, "memory bandwidth (GB/s)": 0.0035714285714285713
}
},
{
"ph": "f", "id": 1173, "pid": 0, "tid": 7, "ts": 6157602227187.270,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 2537909, "tid": 2537909,
"ts": 6157602141649.426, "dur": 4.577,
"args": {
"External id": 213, "cbid": 51, "correlation": 1173
}
},
{
"ph": "s", "id": 1173, "pid": 2537909, "tid": 2537909, "ts": 6157602141649.426,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "nvjet_tst_208x192_64x4_2x1_v_bz_coopB_TNN", "pid": 0, "tid": 7,
"ts": 6157602227190.086, "dur": 4370.341,
"args": {
"External id": 213, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1174, "registers per thread": 168, "shared memory": 221340, "blocks per SM": 1.000000, "warps per SM": 12.000000, "grid": [2, 66, 1], "block": [384, 1, 1], "est. achieved occupancy %": 0
}
},
{
"ph": "f", "id": 1174, "pid": 0, "tid": 7, "ts": 6157602227190.086,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernelEx", "pid": 2537909, "tid": 2537909,
"ts": 6157602141654.243, "dur": 4.607,
"args": {
"External id": 213, "cbid": 652, "correlation": 1174
}
},
{
"ph": "s", "id": 1174, "pid": 2537909, "tid": 2537909, "ts": 6157602141654.243,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_red_fused__log_softmax__log_softmax_backward_data_nll_loss_backward_nll_loss_forward_7", "pid": 0, "tid": 7,
"ts": 6157602231562.763, "dur": 7216.518,
"args": {
"External id": 215, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1195, "registers per thread": 48, "shared memory": 32, "blocks per SM": 248.242432, "warps per SM": 1985.939453, "grid": [32768, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 63
}
},
{
"ph": "f", "id": 1195, "pid": 0, "tid": 7, "ts": 6157602231562.763,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909,
"ts": 6157602141691.439, "dur": 4.998,
"args": {
"External id": 215, "cbid": 307, "correlation": 1195
}
},
{
"ph": "s", "id": 1195, "pid": 2537909, "tid": 2537909, "ts": 6157602141691.439,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused__log_softmax_backward_data_mm_nll_loss_forward_8", "pid": 0, "tid": 7,
"ts": 6157602238781.425, "dur": 2.880,
"args": {
"External id": 216, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1198, "registers per thread": 16, "shared memory": 0, "blocks per SM": 3.393939, "warps per SM": 27.151516, "grid": [448, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 42
}
},
{
"ph": "f", "id": 1198, "pid": 0, "tid": 7, "ts": 6157602238781.425,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909,
"ts": 6157602141707.954, "dur": 3.035,
"args": {
"External id": 216, "cbid": 307, "correlation": 1198
}
},
{
"ph": "s", "id": 1198, "pid": 2537909, "tid": 2537909, "ts": 6157602141707.954,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7,
"ts": 6157602238786.161, "dur": 0.896,
"args": {
"External id": 217, "device": 0, "context": 1, "stream": 7, "correlation": 1213, "bytes": 4, "memory bandwidth (GB/s)": 0.004464285714285714
}
},
{
"ph": "f", "id": 1213, "pid": 0, "tid": 7, "ts": 6157602238786.161,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 2537909, "tid": 2537909,
"ts": 6157602141729.467, "dur": 2.964,
"args": {
"External id": 217, "cbid": 51, "correlation": 1213
}
},
{
"ph": "s", "id": 1213, "pid": 2537909, "tid": 2537909, "ts": 6157602141729.467,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "nvjet_tst_256x128_64x4_1x2_h_bz_coopA_NNT", "pid": 0, "tid": 7,
"ts": 6157602238789.233, "dur": 3724.740,
"args": {
"External id": 217, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1214, "registers per thread": 168, "shared memory": 213148, "blocks per SM": 1.000000, "warps per SM": 12.000000, "grid": [2, 66, 1], "block": [384, 1, 1], "est. achieved occupancy %": 0
}
},
{
"ph": "f", "id": 1214, "pid": 0, "tid": 7, "ts": 6157602238789.233,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernelEx", "pid": 2537909, "tid": 2537909,
"ts": 6157602141732.581, "dur": 3.876,
"args": {
"External id": 217, "cbid": 652, "correlation": 1214
}
},
{
"ph": "s", "id": 1214, "pid": 2537909, "tid": 2537909, "ts": 6157602141732.581,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_red_fused_nll_loss_forward_sum_9", "pid": 0, "tid": 7,
"ts": 6157602242514.997, "dur": 1677.441,
"args": {
"External id": 218, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1225, "registers per thread": 40, "shared memory": 4096, "blocks per SM": 5.954545, "warps per SM": 95.272728, "grid": [786, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 75
}
},
{
"ph": "f", "id": 1225, "pid": 0, "tid": 7, "ts": 6157602242514.997,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909,
"ts": 6157602141753.703, "dur": 3.075,
"args": {
"External id": 218, "cbid": 307, "correlation": 1225
}
},
{
"ph": "s", "id": 1225, "pid": 2537909, "tid": 2537909, "ts": 6157602141753.703,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7,
"ts": 6157602244194.678, "dur": 77.313,
"args": {
"External id": 219, "device": 0, "context": 1, "stream": 7, "correlation": 1232, "bytes": 77194752, "memory bandwidth (GB/s)": 998.4705288890614
}
},
{
"ph": "f", "id": 1232, "pid": 0, "tid": 7, "ts": 6157602244194.678,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 2537909, "tid": 2537909,
"ts": 6157602141769.998, "dur": 10.736,
"args": {
"External id": 219, "cbid": 41, "correlation": 1232
}
},
{
"ph": "s", "id": 1232, "pid": 2537909, "tid": 2537909, "ts": 6157602141769.998,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 2537909, "tid": 2537909,
"ts": 6157602141790.719, "dur": 0.461,
"args": {
"External id": 219, "cbid": 200, "correlation": 1243
}
},
{
"ph": "f", "id": 1243, "pid": 2537909, "tid": 2537909, "ts": 6157602141790.719,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2<cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_256x128_32x6_nt_align8>(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_256x128_32x6_nt_align8::Params)", "pid": 0, "tid": 7,
"ts": 6157602244273.143, "dur": 6101.189,
"args": {
"External id": 219, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1246, "registers per thread": 216, "shared memory": 147456, "blocks per SM": 11.909091, "warps per SM": 95.272728, "grid": [1572, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0
}
},
{
"ph": "f", "id": 1246, "pid": 0, "tid": 7, "ts": 6157602244273.143,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909,
"ts": 6157602141792.382, "dur": 3.054,
"args": {
"External id": 219, "cbid": 307, "correlation": 1246
}
},
{
"ph": "s", "id": 1246, "pid": 2537909, "tid": 2537909, "ts": 6157602141792.382,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_red_fused_nll_loss_forward_10", "pid": 0, "tid": 7,
"ts": 6157602250375.484, "dur": 3.136,
"args": {
"External id": 220, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1258, "registers per thread": 26, "shared memory": 64, "blocks per SM": 0.030303, "warps per SM": 0.484848, "grid": [4, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 1
}
},
{
"ph": "f", "id": 1258, "pid": 0, "tid": 7, "ts": 6157602250375.484,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909,
"ts": 6157602141822.637, "dur": 3.435,
"args": {
"External id": 220, "cbid": 307, "correlation": 1258
}
},
{
"ph": "s", "id": 1258, "pid": 2537909, "tid": 2537909, "ts": 6157602141822.637,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_per_fused_nll_loss_forward_11", "pid": 0, "tid": 7,
"ts": 6157602250380.860, "dur": 2.144,
"args": {
"External id": 221, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1263, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.015152, "grid": [1, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0
}
},
{
"ph": "f", "id": 1263, "pid": 0, "tid": 7, "ts": 6157602250380.860,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909,
"ts": 6157602141838.151, "dur": 2.794,
"args": {
"External id": 221, "cbid": 307, "correlation": 1263
}
},
{
"ph": "s", "id": 1263, "pid": 2537909, "tid": 2537909, "ts": 6157602141838.151,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<8, at::native::FillFunctor<c10::BFloat16>, std::array<char*, 1ul> >(int, at::native::FillFunctor<c10::BFloat16>, std::array<char*, 1ul>)", "pid": 0, "tid": 7,
"ts": 6157602250384.188, "dur": 1.632,
"args": {
"External id": 225, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1274, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.030303, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0
}
},
{
"ph": "f", "id": 1274, "pid": 0, "tid": 7, "ts": 6157602250384.188,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 2537909, "tid": 2537909,
"ts": 6157602141910.049, "dur": 5.619,
"args": {
"External id": 225, "cbid": 211, "correlation": 1274
}
},
{
"ph": "s", "id": 1274, "pid": 2537909, "tid": 2537909, "ts": 6157602141910.049,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 2537909, "tid": 2537909,
"ts": 6157602141932.203, "dur": 1.081,
"args": {
"External id": 183, "cbid": 135, "correlation": 1282
}
},
{
"ph": "f", "id": 1282, "pid": 2537909, "tid": 2537909, "ts": 6157602141932.203,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_0", "pid": 0, "tid": 7,
"ts": 6157602250387.164, "dur": 61.504,
"args": {
"External id": 583, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1291, "registers per thread": 16, "shared memory": 0, "blocks per SM": 285.553040, "warps per SM": 1142.212158, "grid": [37693, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100
}
},
{
"ph": "f", "id": 1291, "pid": 0, "tid": 7, "ts": 6157602250387.164,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2544200,
"ts": 6157602142051.733, "dur": 8.933,
"args": {
"External id": 583, "cbid": 307, "correlation": 1291
}
},
{
"ph": "s", "id": 1291, "pid": 2537909, "tid": 2544200, "ts": 6157602142051.733,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_1", "pid": 0, "tid": 7,
"ts": 6157602250449.724, "dur": 2.560,
"args": {
"External id": 584, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1295, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.750000, "warps per SM": 3.000000, "grid": [99, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 5
}
},
{
"ph": "f", "id": 1295, "pid": 0, "tid": 7, "ts": 6157602250449.724,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2544200,
"ts": 6157602142071.613, "dur": 3.064,
"args": {
"External id": 584, "cbid": 307, "correlation": 1295
}
},
{
"ph": "s", "id": 1295, "pid": 2537909, "tid": 2544200, "ts": 6157602142071.613,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_mul_2", "pid": 0, "tid": 7,
"ts": 6157602250453.340, "dur": 48.449,
"args": {
"External id": 585, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1299, "registers per thread": 22, "shared memory": 0, "blocks per SM": 186.181824, "warps per SM": 744.727295, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100
}
},
{
"ph": "f", "id": 1299, "pid": 0, "tid": 7, "ts": 6157602250453.340,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2544200,
"ts": 6157602142087.627, "dur": 2.864,
"args": {
"External id": 585, "cbid": 307, "correlation": 1299
}
},
{
"ph": "s", "id": 1299, "pid": 2537909, "tid": 2544200, "ts": 6157602142087.627,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 2537909, "tid": 2544200,
"ts": 6157602142107.988, "dur": 0.841,
"args": {
"External id": 581, "cbid": 135, "correlation": 1304
}
},
{
"ph": "f", "id": 1304, "pid": 2537909, "tid": 2544200, "ts": 6157602142107.988,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 2537909, "tid": 2544200,
"ts": 6157602142111.503, "dur": 0.551,
"args": {
"External id": 581, "cbid": 135, "correlation": 1309
}
},
{
"ph": "f", "id": 1309, "pid": 2537909, "tid": 2544200, "ts": 6157602142111.503,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 2537909, "tid": 2544200,
"ts": 6157602142114.377, "dur": 0.341,
"args": {
"External id": 581, "cbid": 135, "correlation": 1314
}
},
{
"ph": "f", "id": 1314, "pid": 2537909, "tid": 2544200, "ts": 6157602142114.377,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceSynchronize", "pid": 2537909, "tid": 2537909,
"ts": 6157602142265.716, "dur": 108250.600,
"args": {
"cbid": 165, "correlation": 1348
}
},
{
"ph": "s", "id": 1348, "pid": 2537909, "tid": 2537909, "ts": 6157602142265.716,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "gpu_user_annotation", "name": "Step 4", "pid": 0, "tid": 7,
"ts": 6157602226971.429, "dur": 23414.392,
"args": {
"External id": 181
}
},
{
"ph": "X", "cat": "gpu_user_annotation", "name": "Step 3", "pid": 0, "tid": 7,
"ts": 6157602203431.087, "dur": 23422.456,
"args": {
"External id": 136
}
},
{
"ph": "X", "cat": "gpu_user_annotation", "name": "Step 2", "pid": 0, "tid": 7,
"ts": 6157602180249.849, "dur": 23063.224,
"args": {
"External id": 91
}
},
{
"ph": "X", "cat": "gpu_user_annotation", "name": "Step 1", "pid": 0, "tid": 7,
"ts": 6157602158480.708, "dur": 21652.375,
"args": {
"External id": 46
}
},
{
"ph": "X", "cat": "gpu_user_annotation", "name": "Step 0", "pid": 0, "tid": 7,
"ts": 6157602136504.304, "dur": 21858.838,
"args": {
"External id": 1
}
},
{
"name": "process_name", "ph": "M", "ts": 6157602134683.368, "pid": 2537909, "tid": 0,
"args": {
"name": "python"
}
},
{
"name": "process_labels", "ph": "M", "ts": 6157602134683.368, "pid": 2537909, "tid": 0,
"args": {
"labels": "CPU"
}
},
{
"name": "process_sort_index", "ph": "M", "ts": 6157602134683.368, "pid": 2537909, "tid": 0,
"args": {
"sort_index": 2537909
}
},
{
"name": "process_name", "ph": "M", "ts": 6157602134683.368, "pid": 0, "tid": 0,
"args": {
"name": "python"
}
},
{
"name": "process_labels", "ph": "M", "ts": 6157602134683.368, "pid": 0, "tid": 0,
"args": {
"labels": "GPU 0"
}
},
{
"name": "process_sort_index", "ph": "M", "ts": 6157602134683.368, "pid": 0, "tid": 0,
"args": {
"sort_index": 5000000
}
},
{
"name": "process_name", "ph": "M", "ts": 6157602134683.368, "pid": 1, "tid": 0,
"args": {
"name": "python"
}
},
{
"name": "process_labels", "ph": "M", "ts": 6157602134683.368, "pid": 1, "tid": 0,
"args": {
"labels": "GPU 1"
}
},
{
"name": "process_sort_index", "ph": "M", "ts": 6157602134683.368, "pid": 1, "tid": 0,
"args": {
"sort_index": 5000001
}
},
{
"name": "process_name", "ph": "M", "ts": 6157602134683.368, "pid": 2, "tid": 0,
"args": {
"name": "python"
}
},
{
"name": "process_labels", "ph": "M", "ts": 6157602134683.368, "pid": 2, "tid": 0,
"args": {
"labels": "GPU 2"
}
},
{
"name": "process_sort_index", "ph": "M", "ts": 6157602134683.368, "pid": 2, "tid": 0,
"args": {
"sort_index": 5000002
}
},
{
"name": "process_name", "ph": "M", "ts": 6157602134683.368, "pid": 3, "tid": 0,
"args": {
"name": "python"
}
},
{
"name": "process_labels", "ph": "M", "ts": 6157602134683.368, "pid": 3, "tid": 0,
"args": {
"labels": "GPU 3"
}
},
{
"name": "process_sort_index", "ph": "M", "ts": 6157602134683.368, "pid": 3, "tid": 0,
"args": {
"sort_index": 5000003
}
},
{
"name": "process_name", "ph": "M", "ts": 6157602134683.368, "pid": 4, "tid": 0,
"args": {
"name": "python"
}
},
{
"name": "process_labels", "ph": "M", "ts": 6157602134683.368, "pid": 4, "tid": 0,
"args": {
"labels": "GPU 4"
}
},
{
"name": "process_sort_index", "ph": "M", "ts": 6157602134683.368, "pid": 4, "tid": 0,
"args": {
"sort_index": 5000004
}
},
{
"name": "process_name", "ph": "M", "ts": 6157602134683.368, "pid": 5, "tid": 0,
"args": {
"name": "python"
}
},
{
"name": "process_labels", "ph": "M", "ts": 6157602134683.368, "pid": 5, "tid": 0,
"args": {
"labels": "GPU 5"
}
},
{
"name": "process_sort_index", "ph": "M", "ts": 6157602134683.368, "pid": 5, "tid": 0,
"args": {
"sort_index": 5000005
}
},
{
"name": "process_name", "ph": "M", "ts": 6157602134683.368, "pid": 6, "tid": 0,
"args": {
"name": "python"
}
},
{
"name": "process_labels", "ph": "M", "ts": 6157602134683.368, "pid": 6, "tid": 0,
"args": {
"labels": "GPU 6"
}
},
{
"name": "process_sort_index", "ph": "M", "ts": 6157602134683.368, "pid": 6, "tid": 0,
"args": {
"sort_index": 5000006
}
},
{
"name": "process_name", "ph": "M", "ts": 6157602134683.368, "pid": 7, "tid": 0,
"args": {
"name": "python"
}
},
{
"name": "process_labels", "ph": "M", "ts": 6157602134683.368, "pid": 7, "tid": 0,
"args": {
"labels": "GPU 7"
}
},
{
"name": "process_sort_index", "ph": "M", "ts": 6157602134683.368, "pid": 7, "tid": 0,
"args": {
"sort_index": 5000007
}
},
{
"name": "process_name", "ph": "M", "ts": 6157602134683.368, "pid": 8, "tid": 0,
"args": {
"name": "python"
}
},
{
"name": "process_labels", "ph": "M", "ts": 6157602134683.368, "pid": 8, "tid": 0,
"args": {
"labels": "GPU 8"
}
},
{
"name": "process_sort_index", "ph": "M", "ts": 6157602134683.368, "pid": 8, "tid": 0,
"args": {
"sort_index": 5000008
}
},
{
"name": "process_name", "ph": "M", "ts": 6157602134683.368, "pid": 9, "tid": 0,
"args": {
"name": "python"
}
},
{
"name": "process_labels", "ph": "M", "ts": 6157602134683.368, "pid": 9, "tid": 0,
"args": {
"labels": "GPU 9"
}
},
{
"name": "process_sort_index", "ph": "M", "ts": 6157602134683.368, "pid": 9, "tid": 0,
"args": {
"sort_index": 5000009
}
},
{
"name": "process_name", "ph": "M", "ts": 6157602134683.368, "pid": 10, "tid": 0,
"args": {
"name": "python"
}
},
{
"name": "process_labels", "ph": "M", "ts": 6157602134683.368, "pid": 10, "tid": 0,
"args": {
"labels": "GPU 10"
}
},
{
"name": "process_sort_index", "ph": "M", "ts": 6157602134683.368, "pid": 10, "tid": 0,
"args": {
"sort_index": 5000010
}
},
{
"name": "process_name", "ph": "M", "ts": 6157602134683.368, "pid": 11, "tid": 0,
"args": {
"name": "python"
}
},
{
"name": "process_labels", "ph": "M", "ts": 6157602134683.368, "pid": 11, "tid": 0,
"args": {
"labels": "GPU 11"
}
},
{
"name": "process_sort_index", "ph": "M", "ts": 6157602134683.368, "pid": 11, "tid": 0,
"args": {
"sort_index": 5000011
}
},
{
"name": "process_name", "ph": "M", "ts": 6157602134683.368, "pid": 12, "tid": 0,
"args": {
"name": "python"
}
},
{
"name": "process_labels", "ph": "M", "ts": 6157602134683.368, "pid": 12, "tid": 0,
"args": {
"labels": "GPU 12"
}
},
{
"name": "process_sort_index", "ph": "M", "ts": 6157602134683.368, "pid": 12, "tid": 0,
"args": {
"sort_index": 5000012
}
},
{
"name": "process_name", "ph": "M", "ts": 6157602134683.368, "pid": 13, "tid": 0,
"args": {
"name": "python"
}
},
{
"name": "process_labels", "ph": "M", "ts": 6157602134683.368, "pid": 13, "tid": 0,
"args": {
"labels": "GPU 13"
}
},
{
"name": "process_sort_index", "ph": "M", "ts": 6157602134683.368, "pid": 13, "tid": 0,
"args": {
"sort_index": 5000013
}
},
{
"name": "process_name", "ph": "M", "ts": 6157602134683.368, "pid": 14, "tid": 0,
"args": {
"name": "python"
}
},
{
"name": "process_labels", "ph": "M", "ts": 6157602134683.368, "pid": 14, "tid": 0,
"args": {
"labels": "GPU 14"
}
},
{
"name": "process_sort_index", "ph": "M", "ts": 6157602134683.368, "pid": 14, "tid": 0,
"args": {
"sort_index": 5000014
}
},
{
"name": "process_name", "ph": "M", "ts": 6157602134683.368, "pid": 15, "tid": 0,
"args": {
"name": "python"
}
},
{
"name": "process_labels", "ph": "M", "ts": 6157602134683.368, "pid": 15, "tid": 0,
"args": {
"labels": "GPU 15"
}
},
{
"name": "process_sort_index", "ph": "M", "ts": 6157602134683.368, "pid": 15, "tid": 0,
"args": {
"sort_index": 5000015
}
},
{
"name": "thread_name", "ph": "M", "ts": 6157602134683.368, "pid": 0, "tid": 7,
"args": {
"name": "stream 7 "
}
},
{
"name": "thread_sort_index", "ph": "M", "ts": 6157602134683.368, "pid": 0, "tid": 7,
"args": {
"sort_index": 7
}
},
{
"name": "thread_name", "ph": "M", "ts": 6157602134683.368, "pid": 2537909, "tid": 2544200,
"args": {
"name": "thread 2544200 (pt_autograd_0)"
}
},
{
"name": "thread_sort_index", "ph": "M", "ts": 6157602134683.368, "pid": 2537909, "tid": 2544200,
"args": {
"sort_index": 2544200
}
},
{
"name": "thread_name", "ph": "M", "ts": 6157602134683.368, "pid": 2537909, "tid": 2544200,
"args": {
"name": "thread 2544200 (python)"
}
},
{
"name": "thread_sort_index", "ph": "M", "ts": 6157602134683.368, "pid": 2537909, "tid": 2544200,
"args": {
"sort_index": 2544200
}
},
{
"name": "thread_name", "ph": "M", "ts": 6157602134683.368, "pid": 2537909, "tid": 2537909,
"args": {
"name": "thread 2537909 (python)"
}
},
{
"name": "thread_sort_index", "ph": "M", "ts": 6157602134683.368, "pid": 2537909, "tid": 2537909,
"args": {
"sort_index": 2537909
}
},
{
"ph": "X", "cat": "Trace", "ts": 6157602134629.026, "dur": 115917.113,
"pid": "Spans", "tid": "PyTorch Profiler",
"name": "PyTorch Profiler (0)",
"args": {
"Op count": 0
}
},
{
"name": "process_sort_index", "ph": "M", "ts": 6157602134629.026,
"pid": "Spans", "tid": 0,
"args": {
"sort_index": 536870912
}
},
{
"name": "Iteration Start: PyTorch Profiler", "ph": "i", "s": "g",
"pid": "Traces", "tid": "Trace PyTorch Profiler", "ts": 6157602134629.026
},
{
"name": "Record Window End", "ph": "i", "s": "g",
"pid": "", "tid": "", "ts": 6157602250920.914
}
],
"traceName": "/tmp/trace.json"
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment