Created
June 11, 2025 22:00
-
-
Save shunting314/6e4572c8c0592403e55424ad1535ae3d to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"schemaVersion": 1, | |
"deviceProperties": [ | |
{ | |
"id": 0, "name": "NVIDIA H100", "totalGlobalMem": 102010781696, | |
"computeMajor": 9, "computeMinor": 0, | |
"maxThreadsPerBlock": 1024, "maxThreadsPerMultiprocessor": 2048, | |
"regsPerBlock": 65536, "warpSize": 32, | |
"sharedMemPerBlock": 49152, "numSms": 132 | |
, "regsPerMultiprocessor": 65536, "sharedMemPerBlockOptin": 232448, "sharedMemPerMultiprocessor": 233472 | |
} | |
], | |
"cupti_version": 24, | |
"cuda_runtime_version": 12060, | |
"cuda_driver_version": 12020, | |
"trace_id": "9C4A4A89A5964FBFA96ADF67B310C25E", | |
"displayTimeUnit": "ms", | |
"baseTimeNanoseconds": 1743521598000000000, | |
"traceEvents": [ | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602137737.466, "dur": 604.001, | |
"args": { | |
"External id": 513,"Record function id": 0, "Sequence number": 130, "Fwd thread id": 1, "Ev Idx": 0 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602137758.027, "dur": 533.565, | |
"args": { | |
"External id": 514,"Record function id": 0, "Sequence number": 130, "Fwd thread id": 1, "Ev Idx": 1 | |
} | |
}, | |
{ | |
"ph": "f", "id": 1, "pid": 2537909, "tid": 2544200, "ts": 6157602137758.027, | |
"cat": "fwdbwd", "name": "fwdbwd", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_0", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602138102.076, "dur": 84.077, | |
"args": { | |
"External id": 515,"Record function id": 0, "Ev Idx": 2 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_1", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602138203.058, "dur": 13.671, | |
"args": { | |
"External id": 516,"Record function id": 0, "Ev Idx": 3 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_mul_2", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602138249.288, "dur": 11.087, | |
"args": { | |
"External id": 517,"Record function id": 0, "Ev Idx": 4 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602138364.051, "dur": 31.888, | |
"args": { | |
"External id": 518,"Record function id": 0, "Ev Idx": 5 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602138367.096, "dur": 26.309, | |
"args": { | |
"External id": 519,"Record function id": 0, "Ev Idx": 6 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602138378.573, "dur": 13.070, | |
"args": { | |
"External id": 520,"Record function id": 0, "Ev Idx": 7 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "detach", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602138381.327, "dur": 10.176, | |
"args": { | |
"External id": 521,"Record function id": 0, "Ev Idx": 8 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602138400.827, "dur": 5.268, | |
"args": { | |
"External id": 522,"Record function id": 0, "Ev Idx": 9 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602138402.549, "dur": 2.744, | |
"args": { | |
"External id": 523,"Record function id": 0, "Ev Idx": 10 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602138403.250, "dur": 1.092, | |
"args": { | |
"External id": 524,"Record function id": 0, "Ev Idx": 11 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "detach", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602138403.521, "dur": 0.691, | |
"args": { | |
"External id": 525,"Record function id": 0, "Ev Idx": 12 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602138410.351, "dur": 4.296, | |
"args": { | |
"External id": 526,"Record function id": 0, "Ev Idx": 13 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602138411.663, "dur": 2.143, | |
"args": { | |
"External id": 527,"Record function id": 0, "Ev Idx": 14 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602138412.504, "dur": 0.841, | |
"args": { | |
"External id": 528,"Record function id": 0, "Ev Idx": 15 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "detach", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602138412.694, "dur": 0.551, | |
"args": { | |
"External id": 529,"Record function id": 0, "Ev Idx": 16 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602139216.717, "dur": 187.873, | |
"args": { | |
"External id": 530,"Record function id": 0, "Sequence number": 131, "Fwd thread id": 1, "Ev Idx": 17 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602139219.601, "dur": 169.786, | |
"args": { | |
"External id": 531,"Record function id": 0, "Sequence number": 131, "Fwd thread id": 1, "Ev Idx": 18 | |
} | |
}, | |
{ | |
"ph": "f", "id": 2, "pid": 2537909, "tid": 2544200, "ts": 6157602139219.601, | |
"cat": "fwdbwd", "name": "fwdbwd", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_0", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602139321.485, "dur": 21.272, | |
"args": { | |
"External id": 532,"Record function id": 0, "Ev Idx": 19 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_1", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602139349.838, "dur": 8.823, | |
"args": { | |
"External id": 533,"Record function id": 0, "Ev Idx": 20 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_mul_2", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602139367.995, "dur": 8.483, | |
"args": { | |
"External id": 534,"Record function id": 0, "Ev Idx": 21 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602139413.864, "dur": 8.603, | |
"args": { | |
"External id": 535,"Record function id": 0, "Ev Idx": 22 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602139415.507, "dur": 5.819, | |
"args": { | |
"External id": 536,"Record function id": 0, "Ev Idx": 23 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602139417.340, "dur": 3.164, | |
"args": { | |
"External id": 537,"Record function id": 0, "Ev Idx": 24 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "detach", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602139417.880, "dur": 2.504, | |
"args": { | |
"External id": 538,"Record function id": 0, "Ev Idx": 25 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602139426.694, "dur": 3.816, | |
"args": { | |
"External id": 539,"Record function id": 0, "Ev Idx": 26 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602139427.805, "dur": 2.003, | |
"args": { | |
"External id": 540,"Record function id": 0, "Ev Idx": 27 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602139428.356, "dur": 1.122, | |
"args": { | |
"External id": 541,"Record function id": 0, "Ev Idx": 28 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "detach", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602139428.547, "dur": 0.831, | |
"args": { | |
"External id": 542,"Record function id": 0, "Ev Idx": 29 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602139434.345, "dur": 3.716, | |
"args": { | |
"External id": 543,"Record function id": 0, "Ev Idx": 30 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602139435.467, "dur": 1.953, | |
"args": { | |
"External id": 544,"Record function id": 0, "Ev Idx": 31 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602139436.218, "dur": 0.901, | |
"args": { | |
"External id": 545,"Record function id": 0, "Ev Idx": 32 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "detach", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602139436.398, "dur": 0.621, | |
"args": { | |
"External id": 546,"Record function id": 0, "Ev Idx": 33 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602140139.529, "dur": 182.335, | |
"args": { | |
"External id": 547,"Record function id": 0, "Sequence number": 132, "Fwd thread id": 1, "Ev Idx": 34 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602140141.973, "dur": 148.083, | |
"args": { | |
"External id": 548,"Record function id": 0, "Sequence number": 132, "Fwd thread id": 1, "Ev Idx": 35 | |
} | |
}, | |
{ | |
"ph": "f", "id": 3, "pid": 2537909, "tid": 2544200, "ts": 6157602140141.973, | |
"cat": "fwdbwd", "name": "fwdbwd", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_0", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602140230.666, "dur": 19.640, | |
"args": { | |
"External id": 549,"Record function id": 0, "Ev Idx": 36 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_1", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602140255.994, "dur": 7.392, | |
"args": { | |
"External id": 550,"Record function id": 0, "Ev Idx": 37 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_mul_2", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602140271.858, "dur": 6.921, | |
"args": { | |
"External id": 551,"Record function id": 0, "Ev Idx": 38 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602140331.288, "dur": 7.291, | |
"args": { | |
"External id": 552,"Record function id": 0, "Ev Idx": 39 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602140332.860, "dur": 4.808, | |
"args": { | |
"External id": 553,"Record function id": 0, "Ev Idx": 40 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602140334.453, "dur": 2.333, | |
"args": { | |
"External id": 554,"Record function id": 0, "Ev Idx": 41 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "detach", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602140334.974, "dur": 1.692, | |
"args": { | |
"External id": 555,"Record function id": 0, "Ev Idx": 42 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602140342.725, "dur": 3.526, | |
"args": { | |
"External id": 556,"Record function id": 0, "Ev Idx": 43 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602140343.877, "dur": 1.703, | |
"args": { | |
"External id": 557,"Record function id": 0, "Ev Idx": 44 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602140344.418, "dur": 0.851, | |
"args": { | |
"External id": 558,"Record function id": 0, "Ev Idx": 45 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "detach", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602140344.608, "dur": 0.571, | |
"args": { | |
"External id": 559,"Record function id": 0, "Ev Idx": 46 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602140350.006, "dur": 3.666, | |
"args": { | |
"External id": 560,"Record function id": 0, "Ev Idx": 47 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602140351.098, "dur": 1.893, | |
"args": { | |
"External id": 561,"Record function id": 0, "Ev Idx": 48 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602140351.859, "dur": 0.821, | |
"args": { | |
"External id": 562,"Record function id": 0, "Ev Idx": 49 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "detach", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602140352.049, "dur": 0.531, | |
"args": { | |
"External id": 563,"Record function id": 0, "Ev Idx": 50 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602141048.339, "dur": 165.029, | |
"args": { | |
"External id": 564,"Record function id": 0, "Sequence number": 133, "Fwd thread id": 1, "Ev Idx": 51 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602141050.823, "dur": 149.005, | |
"args": { | |
"External id": 565,"Record function id": 0, "Sequence number": 133, "Fwd thread id": 1, "Ev Idx": 52 | |
} | |
}, | |
{ | |
"ph": "f", "id": 4, "pid": 2537909, "tid": 2544200, "ts": 6157602141050.823, | |
"cat": "fwdbwd", "name": "fwdbwd", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_0", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602141137.604, "dur": 19.850, | |
"args": { | |
"External id": 566,"Record function id": 0, "Ev Idx": 53 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_1", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602141163.383, "dur": 8.022, | |
"args": { | |
"External id": 567,"Record function id": 0, "Ev Idx": 54 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_mul_2", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602141180.108, "dur": 7.391, | |
"args": { | |
"External id": 568,"Record function id": 0, "Ev Idx": 55 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602141222.252, "dur": 6.910, | |
"args": { | |
"External id": 569,"Record function id": 0, "Ev Idx": 56 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602141223.664, "dur": 4.547, | |
"args": { | |
"External id": 570,"Record function id": 0, "Ev Idx": 57 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602141225.056, "dur": 2.604, | |
"args": { | |
"External id": 571,"Record function id": 0, "Ev Idx": 58 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "detach", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602141225.517, "dur": 2.023, | |
"args": { | |
"External id": 572,"Record function id": 0, "Ev Idx": 59 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602141233.158, "dur": 3.586, | |
"args": { | |
"External id": 573,"Record function id": 0, "Ev Idx": 60 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602141234.250, "dur": 1.763, | |
"args": { | |
"External id": 574,"Record function id": 0, "Ev Idx": 61 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602141234.771, "dur": 0.931, | |
"args": { | |
"External id": 575,"Record function id": 0, "Ev Idx": 62 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "detach", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602141234.981, "dur": 0.621, | |
"args": { | |
"External id": 576,"Record function id": 0, "Ev Idx": 63 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602141240.640, "dur": 3.695, | |
"args": { | |
"External id": 577,"Record function id": 0, "Ev Idx": 64 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602141241.751, "dur": 1.773, | |
"args": { | |
"External id": 578,"Record function id": 0, "Ev Idx": 65 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602141242.562, "dur": 0.661, | |
"args": { | |
"External id": 579,"Record function id": 0, "Ev Idx": 66 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "detach", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602141242.773, "dur": 0.350, | |
"args": { | |
"External id": 580,"Record function id": 0, "Ev Idx": 67 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602141950.821, "dur": 165.509, | |
"args": { | |
"External id": 581,"Record function id": 0, "Sequence number": 134, "Fwd thread id": 1, "Ev Idx": 68 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602141953.054, "dur": 149.405, | |
"args": { | |
"External id": 582,"Record function id": 0, "Sequence number": 134, "Fwd thread id": 1, "Ev Idx": 69 | |
} | |
}, | |
{ | |
"ph": "f", "id": 5, "pid": 2537909, "tid": 2544200, "ts": 6157602141953.054, | |
"cat": "fwdbwd", "name": "fwdbwd", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_0", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602142042.218, "dur": 19.079, | |
"args": { | |
"External id": 583,"Record function id": 0, "Ev Idx": 70 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_1", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602142067.226, "dur": 7.822, | |
"args": { | |
"External id": 584,"Record function id": 0, "Ev Idx": 71 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_mul_2", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602142083.020, "dur": 7.812, | |
"args": { | |
"External id": 585,"Record function id": 0, "Ev Idx": 72 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602142125.103, "dur": 6.640, | |
"args": { | |
"External id": 586,"Record function id": 0, "Ev Idx": 73 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602142126.385, "dur": 4.337, | |
"args": { | |
"External id": 587,"Record function id": 0, "Ev Idx": 74 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602142127.797, "dur": 2.454, | |
"args": { | |
"External id": 588,"Record function id": 0, "Ev Idx": 75 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "detach", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602142128.358, "dur": 1.773, | |
"args": { | |
"External id": 589,"Record function id": 0, "Ev Idx": 76 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602142136.431, "dur": 3.555, | |
"args": { | |
"External id": 590,"Record function id": 0, "Ev Idx": 77 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602142137.482, "dur": 1.813, | |
"args": { | |
"External id": 591,"Record function id": 0, "Ev Idx": 78 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602142137.993, "dur": 0.841, | |
"args": { | |
"External id": 592,"Record function id": 0, "Ev Idx": 79 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "detach", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602142138.163, "dur": 0.571, | |
"args": { | |
"External id": 593,"Record function id": 0, "Ev Idx": 80 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602142143.742, "dur": 3.615, | |
"args": { | |
"External id": 594,"Record function id": 0, "Ev Idx": 81 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602142144.863, "dur": 1.863, | |
"args": { | |
"External id": 595,"Record function id": 0, "Ev Idx": 82 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602142145.594, "dur": 0.852, | |
"args": { | |
"External id": 596,"Record function id": 0, "Ev Idx": 83 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "detach", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602142145.805, "dur": 0.550, | |
"args": { | |
"External id": 597,"Record function id": 0, "Ev Idx": 84 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "user_annotation", "name": "Step 0", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602134832.545, "dur": 3689.324, | |
"args": { | |
"External id": 1,"Record function id": 0, "Ev Idx": 85 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602134895.550, "dur": 67.402, | |
"args": { | |
"External id": 2,"Record function id": 0, "Ev Idx": 86 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 0/0", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602134966.146, "dur": 3489.453, | |
"args": { | |
"External id": 3,"Record function id": 0, "Ev Idx": 87 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "Pregraph bytecode", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602134982.221, "dur": 5.037, | |
"args": { | |
"External id": 4,"Record function id": 0, "Ev Idx": 88 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602135062.402, "dur": 2277.555, | |
"args": { | |
"External id": 5,"Record function id": 0, "Sequence number": 130, "Fwd thread id": 0, "Ev Idx": 89 | |
} | |
}, | |
{ | |
"ph": "s", "id": 1, "pid": 2537909, "tid": 2537909, "ts": 6157602135062.402, | |
"cat": "fwdbwd", "name": "fwdbwd" | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused_nll_loss_backward_nll_loss_forward_0", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602136439.418, "dur": 67.392, | |
"args": { | |
"External id": 6,"Record function id": 0, "Ev Idx": 90 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_per_fused_nll_loss_forward_1", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602136528.713, "dur": 14.622, | |
"args": { | |
"External id": 7,"Record function id": 0, "Ev Idx": 91 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_mul_2", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602136564.988, "dur": 13.400, | |
"args": { | |
"External id": 8,"Record function id": 0, "Ev Idx": 92 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602136598.328, "dur": 43.376, | |
"args": { | |
"External id": 9,"Record function id": 0, "Ev Idx": 93 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602136608.584, "dur": 30.466, | |
"args": { | |
"External id": 10,"Record function id": 0, "Ev Idx": 94 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602136615.214, "dur": 18.948, | |
"args": { | |
"External id": 11,"Record function id": 0, "Ev Idx": 95 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602136624.958, "dur": 8.773, | |
"args": { | |
"External id": 12,"Record function id": 0, "Ev Idx": 96 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602136628.484, "dur": 3.625, | |
"args": { | |
"External id": 13,"Record function id": 0, "Ev Idx": 97 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602136651.659, "dur": 3.525, | |
"args": { | |
"External id": 14,"Record function id": 0, "Ev Idx": 98 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602136652.039, "dur": 2.864, | |
"args": { | |
"External id": 15,"Record function id": 0, "Ev Idx": 99 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602136652.530, "dur": 1.612, | |
"args": { | |
"External id": 16,"Record function id": 0, "Ev Idx": 100 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602136652.981, "dur": 0.991, | |
"args": { | |
"External id": 17,"Record function id": 0, "Ev Idx": 101 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602136653.411, "dur": 0.311, | |
"args": { | |
"External id": 18,"Record function id": 0, "Ev Idx": 102 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602136658.539, "dur": 2.964, | |
"args": { | |
"External id": 19,"Record function id": 0, "Ev Idx": 103 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602136658.839, "dur": 2.434, | |
"args": { | |
"External id": 20,"Record function id": 0, "Ev Idx": 104 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602136659.290, "dur": 1.192, | |
"args": { | |
"External id": 21,"Record function id": 0, "Ev Idx": 105 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602136659.611, "dur": 0.741, | |
"args": { | |
"External id": 22,"Record function id": 0, "Ev Idx": 106 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602136659.911, "dur": 0.260, | |
"args": { | |
"External id": 23,"Record function id": 0, "Ev Idx": 107 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602136664.658, "dur": 2.734, | |
"args": { | |
"External id": 24,"Record function id": 0, "Ev Idx": 108 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602136664.959, "dur": 2.213, | |
"args": { | |
"External id": 25,"Record function id": 0, "Ev Idx": 109 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602136665.319, "dur": 1.082, | |
"args": { | |
"External id": 26,"Record function id": 0, "Ev Idx": 110 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602136665.640, "dur": 0.651, | |
"args": { | |
"External id": 27,"Record function id": 0, "Ev Idx": 111 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602136665.910, "dur": 0.210, | |
"args": { | |
"External id": 28,"Record function id": 0, "Ev Idx": 112 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_3", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602136684.228, "dur": 13.360, | |
"args": { | |
"External id": 29,"Record function id": 0, "Ev Idx": 113 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_4", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602136710.828, "dur": 10.936, | |
"args": { | |
"External id": 30,"Record function id": 0, "Ev Idx": 114 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_5", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602136734.103, "dur": 13.240, | |
"args": { | |
"External id": 31,"Record function id": 0, "Ev Idx": 115 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_addmm_mm_nll_loss_forward_6", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602136785.701, "dur": 14.782, | |
"args": { | |
"External id": 32,"Record function id": 0, "Ev Idx": 116 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602136812.121, "dur": 106.931, | |
"args": { | |
"External id": 33,"Record function id": 0, "Ev Idx": 117 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602136928.987, "dur": 4.737, | |
"args": { | |
"External id": 34,"Record function id": 0, "Ev Idx": 118 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused__log_softmax__log_softmax_backward_data_nll_loss_backward_nll_loss_forward_7", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602136953.654, "dur": 23.355, | |
"args": { | |
"External id": 35,"Record function id": 0, "Ev Idx": 119 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused__log_softmax_backward_data_mm_nll_loss_forward_8", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602136991.711, "dur": 11.257, | |
"args": { | |
"External id": 36,"Record function id": 0, "Ev Idx": 120 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602137008.467, "dur": 31.507, | |
"args": { | |
"External id": 37,"Record function id": 0, "Ev Idx": 121 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused_nll_loss_forward_sum_9", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602137058.913, "dur": 15.553, | |
"args": { | |
"External id": 38,"Record function id": 0, "Ev Idx": 122 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::addmm", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602137087.326, "dur": 101.903, | |
"args": { | |
"External id": 39,"Record function id": 0, "Ev Idx": 123 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused_nll_loss_forward_10", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602137220.446, "dur": 14.472, | |
"args": { | |
"External id": 40,"Record function id": 0, "Ev Idx": 124 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_per_fused_nll_loss_forward_11", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602137249.871, "dur": 10.846, | |
"args": { | |
"External id": 41,"Record function id": 0, "Ev Idx": 125 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::ones_like", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602137401.890, "dur": 42.925, | |
"args": { | |
"External id": 42,"Record function id": 0, "Ev Idx": 126 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602137403.963, "dur": 14.322, | |
"args": { | |
"External id": 43,"Record function id": 0, "Ev Idx": 127 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602137410.293, "dur": 7.451, | |
"args": { | |
"External id": 44,"Record function id": 0, "Ev Idx": 128 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602137423.082, "dur": 21.492, | |
"args": { | |
"External id": 45,"Record function id": 0, "Ev Idx": 129 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "user_annotation", "name": "Step 1", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602138535.910, "dur": 964.845, | |
"args": { | |
"External id": 46,"Record function id": 0, "Ev Idx": 130 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602138560.968, "dur": 22.885, | |
"args": { | |
"External id": 47,"Record function id": 0, "Ev Idx": 131 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 0/0", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602138584.474, "dur": 880.317, | |
"args": { | |
"External id": 48,"Record function id": 0, "Ev Idx": 132 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "Pregraph bytecode", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602138590.062, "dur": 1.753, | |
"args": { | |
"External id": 49,"Record function id": 0, "Ev Idx": 133 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602138622.571, "dur": 513.725, | |
"args": { | |
"External id": 50,"Record function id": 0, "Sequence number": 131, "Fwd thread id": 0, "Ev Idx": 134 | |
} | |
}, | |
{ | |
"ph": "s", "id": 2, "pid": 2537909, "tid": 2537909, "ts": 6157602138622.571, | |
"cat": "fwdbwd", "name": "fwdbwd" | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused_nll_loss_backward_nll_loss_forward_0", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602138683.313, "dur": 26.900, | |
"args": { | |
"External id": 51,"Record function id": 0, "Ev Idx": 135 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_per_fused_nll_loss_forward_1", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602138720.138, "dur": 10.045, | |
"args": { | |
"External id": 52,"Record function id": 0, "Ev Idx": 136 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_mul_2", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602138738.967, "dur": 8.232, | |
"args": { | |
"External id": 53,"Record function id": 0, "Ev Idx": 137 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602138755.672, "dur": 12.078, | |
"args": { | |
"External id": 54,"Record function id": 0, "Ev Idx": 138 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602138757.404, "dur": 9.405, | |
"args": { | |
"External id": 55,"Record function id": 0, "Ev Idx": 139 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602138759.638, "dur": 5.478, | |
"args": { | |
"External id": 56,"Record function id": 0, "Ev Idx": 140 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602138761.250, "dur": 3.535, | |
"args": { | |
"External id": 57,"Record function id": 0, "Ev Idx": 141 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602138762.422, "dur": 1.713, | |
"args": { | |
"External id": 58,"Record function id": 0, "Ev Idx": 142 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602138774.039, "dur": 3.656, | |
"args": { | |
"External id": 59,"Record function id": 0, "Ev Idx": 143 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602138774.530, "dur": 2.844, | |
"args": { | |
"External id": 60,"Record function id": 0, "Ev Idx": 144 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602138774.971, "dur": 1.833, | |
"args": { | |
"External id": 61,"Record function id": 0, "Ev Idx": 145 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602138775.341, "dur": 1.272, | |
"args": { | |
"External id": 62,"Record function id": 0, "Ev Idx": 146 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602138775.812, "dur": 0.541, | |
"args": { | |
"External id": 63,"Record function id": 0, "Ev Idx": 147 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602138780.850, "dur": 3.154, | |
"args": { | |
"External id": 64,"Record function id": 0, "Ev Idx": 148 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602138781.150, "dur": 2.604, | |
"args": { | |
"External id": 65,"Record function id": 0, "Ev Idx": 149 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602138781.531, "dur": 1.562, | |
"args": { | |
"External id": 66,"Record function id": 0, "Ev Idx": 150 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602138781.911, "dur": 1.042, | |
"args": { | |
"External id": 67,"Record function id": 0, "Ev Idx": 151 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602138782.322, "dur": 0.431, | |
"args": { | |
"External id": 68,"Record function id": 0, "Ev Idx": 152 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602138786.929, "dur": 2.714, | |
"args": { | |
"External id": 69,"Record function id": 0, "Ev Idx": 153 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602138787.279, "dur": 2.134, | |
"args": { | |
"External id": 70,"Record function id": 0, "Ev Idx": 154 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602138787.620, "dur": 1.352, | |
"args": { | |
"External id": 71,"Record function id": 0, "Ev Idx": 155 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602138787.940, "dur": 0.872, | |
"args": { | |
"External id": 72,"Record function id": 0, "Ev Idx": 156 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602138788.211, "dur": 0.430, | |
"args": { | |
"External id": 73,"Record function id": 0, "Ev Idx": 157 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_3", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602138800.559, "dur": 9.314, | |
"args": { | |
"External id": 74,"Record function id": 0, "Ev Idx": 158 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_4", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602138816.884, "dur": 7.161, | |
"args": { | |
"External id": 75,"Record function id": 0, "Ev Idx": 159 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_5", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602138830.465, "dur": 6.880, | |
"args": { | |
"External id": 76,"Record function id": 0, "Ev Idx": 160 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_addmm_mm_nll_loss_forward_6", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602138861.942, "dur": 10.235, | |
"args": { | |
"External id": 77,"Record function id": 0, "Ev Idx": 161 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602138876.865, "dur": 39.028, | |
"args": { | |
"External id": 78,"Record function id": 0, "Ev Idx": 162 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602138923.144, "dur": 2.935, | |
"args": { | |
"External id": 79,"Record function id": 0, "Ev Idx": 163 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused__log_softmax__log_softmax_backward_data_nll_loss_backward_nll_loss_forward_7", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602138937.756, "dur": 14.903, | |
"args": { | |
"External id": 80,"Record function id": 0, "Ev Idx": 164 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused__log_softmax_backward_data_mm_nll_loss_forward_8", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602138961.041, "dur": 7.682, | |
"args": { | |
"External id": 81,"Record function id": 0, "Ev Idx": 165 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602138973.981, "dur": 22.023, | |
"args": { | |
"External id": 82,"Record function id": 0, "Ev Idx": 166 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused_nll_loss_forward_sum_9", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602139007.992, "dur": 9.294, | |
"args": { | |
"External id": 83,"Record function id": 0, "Ev Idx": 167 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::addmm", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602139024.477, "dur": 34.993, | |
"args": { | |
"External id": 84,"Record function id": 0, "Ev Idx": 168 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused_nll_loss_forward_10", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602139078.609, "dur": 10.175, | |
"args": { | |
"External id": 85,"Record function id": 0, "Ev Idx": 169 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_per_fused_nll_loss_forward_11", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602139095.604, "dur": 7.532, | |
"args": { | |
"External id": 86,"Record function id": 0, "Ev Idx": 170 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::ones_like", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602139162.445, "dur": 18.027, | |
"args": { | |
"External id": 87,"Record function id": 0, "Ev Idx": 171 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602139163.066, "dur": 5.529, | |
"args": { | |
"External id": 88,"Record function id": 0, "Ev Idx": 172 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602139163.927, "dur": 4.237, | |
"args": { | |
"External id": 89,"Record function id": 0, "Ev Idx": 173 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602139169.245, "dur": 10.987, | |
"args": { | |
"External id": 90,"Record function id": 0, "Ev Idx": 174 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "user_annotation", "name": "Step 2", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602139510.931, "dur": 902.441, | |
"args": { | |
"External id": 91,"Record function id": 0, "Ev Idx": 175 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602139530.260, "dur": 14.562, | |
"args": { | |
"External id": 92,"Record function id": 0, "Ev Idx": 176 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 0/0", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602139545.353, "dur": 834.088, | |
"args": { | |
"External id": 93,"Record function id": 0, "Ev Idx": 177 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "Pregraph bytecode", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602139549.940, "dur": 1.852, | |
"args": { | |
"External id": 94,"Record function id": 0, "Ev Idx": 178 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602139577.691, "dur": 486.835, | |
"args": { | |
"External id": 95,"Record function id": 0, "Sequence number": 132, "Fwd thread id": 0, "Ev Idx": 179 | |
} | |
}, | |
{ | |
"ph": "s", "id": 3, "pid": 2537909, "tid": 2537909, "ts": 6157602139577.691, | |
"cat": "fwdbwd", "name": "fwdbwd" | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused_nll_loss_backward_nll_loss_forward_0", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602139633.736, "dur": 24.968, | |
"args": { | |
"External id": 96,"Record function id": 0, "Ev Idx": 180 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_per_fused_nll_loss_forward_1", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602139666.906, "dur": 9.835, | |
"args": { | |
"External id": 97,"Record function id": 0, "Ev Idx": 181 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_mul_2", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602139685.103, "dur": 7.832, | |
"args": { | |
"External id": 98,"Record function id": 0, "Ev Idx": 182 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602139701.128, "dur": 11.497, | |
"args": { | |
"External id": 99,"Record function id": 0, "Ev Idx": 183 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602139702.800, "dur": 8.844, | |
"args": { | |
"External id": 100,"Record function id": 0, "Ev Idx": 184 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602139704.843, "dur": 5.278, | |
"args": { | |
"External id": 101,"Record function id": 0, "Ev Idx": 185 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602139706.436, "dur": 3.355, | |
"args": { | |
"External id": 102,"Record function id": 0, "Ev Idx": 186 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602139707.587, "dur": 1.573, | |
"args": { | |
"External id": 103,"Record function id": 0, "Ev Idx": 187 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602139719.485, "dur": 3.566, | |
"args": { | |
"External id": 104,"Record function id": 0, "Ev Idx": 188 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602139719.886, "dur": 2.704, | |
"args": { | |
"External id": 105,"Record function id": 0, "Ev Idx": 189 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602139720.357, "dur": 1.692, | |
"args": { | |
"External id": 106,"Record function id": 0, "Ev Idx": 190 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602139720.757, "dur": 1.112, | |
"args": { | |
"External id": 107,"Record function id": 0, "Ev Idx": 191 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602139721.108, "dur": 0.531, | |
"args": { | |
"External id": 108,"Record function id": 0, "Ev Idx": 192 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602139726.175, "dur": 2.855, | |
"args": { | |
"External id": 109,"Record function id": 0, "Ev Idx": 193 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602139726.476, "dur": 2.243, | |
"args": { | |
"External id": 110,"Record function id": 0, "Ev Idx": 194 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602139726.997, "dur": 1.211, | |
"args": { | |
"External id": 111,"Record function id": 0, "Ev Idx": 195 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602139727.337, "dur": 0.741, | |
"args": { | |
"External id": 112,"Record function id": 0, "Ev Idx": 196 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602139727.658, "dur": 0.240, | |
"args": { | |
"External id": 113,"Record function id": 0, "Ev Idx": 197 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602139732.124, "dur": 2.424, | |
"args": { | |
"External id": 114,"Record function id": 0, "Ev Idx": 198 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602139732.415, "dur": 1.923, | |
"args": { | |
"External id": 115,"Record function id": 0, "Ev Idx": 199 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602139732.735, "dur": 1.102, | |
"args": { | |
"External id": 116,"Record function id": 0, "Ev Idx": 200 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602139733.066, "dur": 0.651, | |
"args": { | |
"External id": 117,"Record function id": 0, "Ev Idx": 201 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602139733.346, "dur": 0.191, | |
"args": { | |
"External id": 118,"Record function id": 0, "Ev Idx": 202 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_3", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602139744.403, "dur": 9.214, | |
"args": { | |
"External id": 119,"Record function id": 0, "Ev Idx": 203 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_4", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602139759.996, "dur": 7.542, | |
"args": { | |
"External id": 120,"Record function id": 0, "Ev Idx": 204 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_5", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602139774.558, "dur": 6.701, | |
"args": { | |
"External id": 121,"Record function id": 0, "Ev Idx": 205 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_addmm_mm_nll_loss_forward_6", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602139805.064, "dur": 9.294, | |
"args": { | |
"External id": 122,"Record function id": 0, "Ev Idx": 206 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602139818.765, "dur": 33.851, | |
"args": { | |
"External id": 123,"Record function id": 0, "Ev Idx": 207 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602139859.396, "dur": 2.744, | |
"args": { | |
"External id": 124,"Record function id": 0, "Ev Idx": 208 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused__log_softmax__log_softmax_backward_data_nll_loss_backward_nll_loss_forward_7", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602139873.448, "dur": 14.722, | |
"args": { | |
"External id": 125,"Record function id": 0, "Ev Idx": 209 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused__log_softmax_backward_data_mm_nll_loss_forward_8", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602139896.472, "dur": 7.121, | |
"args": { | |
"External id": 126,"Record function id": 0, "Ev Idx": 210 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602139908.470, "dur": 20.992, | |
"args": { | |
"External id": 127,"Record function id": 0, "Ev Idx": 211 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused_nll_loss_forward_sum_9", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602139941.170, "dur": 9.584, | |
"args": { | |
"External id": 128,"Record function id": 0, "Ev Idx": 212 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::addmm", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602139957.454, "dur": 33.280, | |
"args": { | |
"External id": 129,"Record function id": 0, "Ev Idx": 213 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused_nll_loss_forward_10", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602140009.573, "dur": 10.496, | |
"args": { | |
"External id": 130,"Record function id": 0, "Ev Idx": 214 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_per_fused_nll_loss_forward_11", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602140026.268, "dur": 7.461, | |
"args": { | |
"External id": 131,"Record function id": 0, "Ev Idx": 215 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::ones_like", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602140088.332, "dur": 17.837, | |
"args": { | |
"External id": 132,"Record function id": 0, "Ev Idx": 216 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602140089.063, "dur": 5.658, | |
"args": { | |
"External id": 133,"Record function id": 0, "Ev Idx": 217 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602140089.994, "dur": 4.347, | |
"args": { | |
"External id": 134,"Record function id": 0, "Ev Idx": 218 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602140095.332, "dur": 10.606, | |
"args": { | |
"External id": 135,"Record function id": 0, "Ev Idx": 219 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "user_annotation", "name": "Step 3", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602140423.197, "dur": 889.161, | |
"args": { | |
"External id": 136,"Record function id": 0, "Ev Idx": 220 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602140442.916, "dur": 15.454, | |
"args": { | |
"External id": 137,"Record function id": 0, "Ev Idx": 221 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 0/0", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602140459.241, "dur": 812.065, | |
"args": { | |
"External id": 138,"Record function id": 0, "Ev Idx": 222 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "Pregraph bytecode", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602140463.227, "dur": 1.602, | |
"args": { | |
"External id": 139,"Record function id": 0, "Ev Idx": 223 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602140490.488, "dur": 480.174, | |
"args": { | |
"External id": 140,"Record function id": 0, "Sequence number": 133, "Fwd thread id": 0, "Ev Idx": 224 | |
} | |
}, | |
{ | |
"ph": "s", "id": 4, "pid": 2537909, "tid": 2537909, "ts": 6157602140490.488, | |
"cat": "fwdbwd", "name": "fwdbwd" | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused_nll_loss_backward_nll_loss_forward_0", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602140543.789, "dur": 25.198, | |
"args": { | |
"External id": 141,"Record function id": 0, "Ev Idx": 225 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_per_fused_nll_loss_forward_1", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602140578.261, "dur": 9.444, | |
"args": { | |
"External id": 142,"Record function id": 0, "Ev Idx": 226 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_mul_2", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602140595.947, "dur": 7.642, | |
"args": { | |
"External id": 143,"Record function id": 0, "Ev Idx": 227 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602140611.431, "dur": 11.116, | |
"args": { | |
"External id": 144,"Record function id": 0, "Ev Idx": 228 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602140613.143, "dur": 8.553, | |
"args": { | |
"External id": 145,"Record function id": 0, "Ev Idx": 229 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602140615.026, "dur": 5.178, | |
"args": { | |
"External id": 146,"Record function id": 0, "Ev Idx": 230 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602140616.819, "dur": 3.074, | |
"args": { | |
"External id": 147,"Record function id": 0, "Ev Idx": 231 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602140617.860, "dur": 1.422, | |
"args": { | |
"External id": 148,"Record function id": 0, "Ev Idx": 232 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602140629.197, "dur": 3.456, | |
"args": { | |
"External id": 149,"Record function id": 0, "Ev Idx": 233 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602140629.598, "dur": 2.774, | |
"args": { | |
"External id": 150,"Record function id": 0, "Ev Idx": 234 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602140630.039, "dur": 1.802, | |
"args": { | |
"External id": 151,"Record function id": 0, "Ev Idx": 235 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602140630.469, "dur": 1.172, | |
"args": { | |
"External id": 152,"Record function id": 0, "Ev Idx": 236 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602140630.820, "dur": 0.571, | |
"args": { | |
"External id": 153,"Record function id": 0, "Ev Idx": 237 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602140635.797, "dur": 2.714, | |
"args": { | |
"External id": 154,"Record function id": 0, "Ev Idx": 238 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602140636.078, "dur": 2.183, | |
"args": { | |
"External id": 155,"Record function id": 0, "Ev Idx": 239 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602140636.488, "dur": 1.282, | |
"args": { | |
"External id": 156,"Record function id": 0, "Ev Idx": 240 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602140636.829, "dur": 0.811, | |
"args": { | |
"External id": 157,"Record function id": 0, "Ev Idx": 241 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602140637.189, "dur": 0.271, | |
"args": { | |
"External id": 158,"Record function id": 0, "Ev Idx": 242 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602140641.376, "dur": 2.704, | |
"args": { | |
"External id": 159,"Record function id": 0, "Ev Idx": 243 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602140641.676, "dur": 2.193, | |
"args": { | |
"External id": 160,"Record function id": 0, "Ev Idx": 244 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602140642.017, "dur": 1.382, | |
"args": { | |
"External id": 161,"Record function id": 0, "Ev Idx": 245 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602140642.357, "dur": 0.892, | |
"args": { | |
"External id": 162,"Record function id": 0, "Ev Idx": 246 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602140642.648, "dur": 0.430, | |
"args": { | |
"External id": 163,"Record function id": 0, "Ev Idx": 247 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_3", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602140653.344, "dur": 8.563, | |
"args": { | |
"External id": 164,"Record function id": 0, "Ev Idx": 248 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_4", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602140668.306, "dur": 6.951, | |
"args": { | |
"External id": 165,"Record function id": 0, "Ev Idx": 249 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_5", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602140681.947, "dur": 6.920, | |
"args": { | |
"External id": 166,"Record function id": 0, "Ev Idx": 250 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_addmm_mm_nll_loss_forward_6", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602140713.685, "dur": 8.032, | |
"args": { | |
"External id": 167,"Record function id": 0, "Ev Idx": 251 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602140725.823, "dur": 36.505, | |
"args": { | |
"External id": 168,"Record function id": 0, "Ev Idx": 252 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602140769.118, "dur": 2.835, | |
"args": { | |
"External id": 169,"Record function id": 0, "Ev Idx": 253 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused__log_softmax__log_softmax_backward_data_nll_loss_backward_nll_loss_forward_7", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602140783.460, "dur": 13.841, | |
"args": { | |
"External id": 170,"Record function id": 0, "Ev Idx": 254 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused__log_softmax_backward_data_mm_nll_loss_forward_8", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602140804.181, "dur": 6.460, | |
"args": { | |
"External id": 171,"Record function id": 0, "Ev Idx": 255 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602140815.699, "dur": 21.152, | |
"args": { | |
"External id": 172,"Record function id": 0, "Ev Idx": 256 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused_nll_loss_forward_sum_9", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602140848.027, "dur": 8.714, | |
"args": { | |
"External id": 173,"Record function id": 0, "Ev Idx": 257 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::addmm", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602140863.411, "dur": 33.280, | |
"args": { | |
"External id": 174,"Record function id": 0, "Ev Idx": 258 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused_nll_loss_forward_10", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602140914.908, "dur": 10.646, | |
"args": { | |
"External id": 175,"Record function id": 0, "Ev Idx": 259 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_per_fused_nll_loss_forward_11", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602140932.094, "dur": 7.822, | |
"args": { | |
"External id": 176,"Record function id": 0, "Ev Idx": 260 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::ones_like", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602140995.400, "dur": 18.057, | |
"args": { | |
"External id": 177,"Record function id": 0, "Ev Idx": 261 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602140996.011, "dur": 5.728, | |
"args": { | |
"External id": 178,"Record function id": 0, "Ev Idx": 262 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602140996.952, "dur": 4.357, | |
"args": { | |
"External id": 179,"Record function id": 0, "Ev Idx": 263 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602141002.340, "dur": 10.897, | |
"args": { | |
"External id": 180,"Record function id": 0, "Ev Idx": 264 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "user_annotation", "name": "Step 4", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602141322.403, "dur": 884.123, | |
"args": { | |
"External id": 181,"Record function id": 0, "Ev Idx": 265 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602141341.171, "dur": 15.674, | |
"args": { | |
"External id": 182,"Record function id": 0, "Ev Idx": 266 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 0/0", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602141357.386, "dur": 815.660, | |
"args": { | |
"External id": 183,"Record function id": 0, "Ev Idx": 267 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "Pregraph bytecode", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602141361.312, "dur": 1.382, | |
"args": { | |
"External id": 184,"Record function id": 0, "Ev Idx": 268 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602141388.873, "dur": 483.830, | |
"args": { | |
"External id": 185,"Record function id": 0, "Sequence number": 134, "Fwd thread id": 0, "Ev Idx": 269 | |
} | |
}, | |
{ | |
"ph": "s", "id": 5, "pid": 2537909, "tid": 2537909, "ts": 6157602141388.873, | |
"cat": "fwdbwd", "name": "fwdbwd" | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused_nll_loss_backward_nll_loss_forward_0", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602141442.574, "dur": 24.597, | |
"args": { | |
"External id": 186,"Record function id": 0, "Ev Idx": 270 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_per_fused_nll_loss_forward_1", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602141476.425, "dur": 9.014, | |
"args": { | |
"External id": 187,"Record function id": 0, "Ev Idx": 271 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_mul_2", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602141493.992, "dur": 8.232, | |
"args": { | |
"External id": 188,"Record function id": 0, "Ev Idx": 272 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602141510.096, "dur": 10.956, | |
"args": { | |
"External id": 189,"Record function id": 0, "Ev Idx": 273 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602141511.618, "dur": 8.603, | |
"args": { | |
"External id": 190,"Record function id": 0, "Ev Idx": 274 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602141513.391, "dur": 5.328, | |
"args": { | |
"External id": 191,"Record function id": 0, "Ev Idx": 275 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602141515.404, "dur": 2.964, | |
"args": { | |
"External id": 192,"Record function id": 0, "Ev Idx": 276 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602141516.566, "dur": 1.161, | |
"args": { | |
"External id": 193,"Record function id": 0, "Ev Idx": 277 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602141527.572, "dur": 3.676, | |
"args": { | |
"External id": 194,"Record function id": 0, "Ev Idx": 278 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602141527.973, "dur": 2.964, | |
"args": { | |
"External id": 195,"Record function id": 0, "Ev Idx": 279 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602141528.404, "dur": 1.993, | |
"args": { | |
"External id": 196,"Record function id": 0, "Ev Idx": 280 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602141528.954, "dur": 1.272, | |
"args": { | |
"External id": 197,"Record function id": 0, "Ev Idx": 281 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602141529.395, "dur": 0.591, | |
"args": { | |
"External id": 198,"Record function id": 0, "Ev Idx": 282 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602141534.543, "dur": 2.514, | |
"args": { | |
"External id": 199,"Record function id": 0, "Ev Idx": 283 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602141534.833, "dur": 2.013, | |
"args": { | |
"External id": 200,"Record function id": 0, "Ev Idx": 284 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602141535.214, "dur": 1.121, | |
"args": { | |
"External id": 201,"Record function id": 0, "Ev Idx": 285 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602141535.554, "dur": 0.651, | |
"args": { | |
"External id": 202,"Record function id": 0, "Ev Idx": 286 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602141535.835, "dur": 0.200, | |
"args": { | |
"External id": 203,"Record function id": 0, "Ev Idx": 287 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602141539.901, "dur": 2.864, | |
"args": { | |
"External id": 204,"Record function id": 0, "Ev Idx": 288 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602141540.261, "dur": 2.284, | |
"args": { | |
"External id": 205,"Record function id": 0, "Ev Idx": 289 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602141540.582, "dur": 1.512, | |
"args": { | |
"External id": 206,"Record function id": 0, "Ev Idx": 290 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602141541.083, "dur": 0.861, | |
"args": { | |
"External id": 207,"Record function id": 0, "Ev Idx": 291 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602141541.393, "dur": 0.391, | |
"args": { | |
"External id": 208,"Record function id": 0, "Ev Idx": 292 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_3", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602141553.311, "dur": 9.044, | |
"args": { | |
"External id": 209,"Record function id": 0, "Ev Idx": 293 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_4", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602141570.006, "dur": 7.592, | |
"args": { | |
"External id": 210,"Record function id": 0, "Ev Idx": 294 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_5", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602141583.687, "dur": 5.919, | |
"args": { | |
"External id": 211,"Record function id": 0, "Ev Idx": 295 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_addmm_mm_nll_loss_forward_6", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602141613.792, "dur": 9.064, | |
"args": { | |
"External id": 212,"Record function id": 0, "Ev Idx": 296 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602141627.523, "dur": 32.940, | |
"args": { | |
"External id": 213,"Record function id": 0, "Ev Idx": 297 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602141667.233, "dur": 2.854, | |
"args": { | |
"External id": 214,"Record function id": 0, "Ev Idx": 298 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused__log_softmax__log_softmax_backward_data_nll_loss_backward_nll_loss_forward_7", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602141681.514, "dur": 15.424, | |
"args": { | |
"External id": 215,"Record function id": 0, "Ev Idx": 299 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused__log_softmax_backward_data_mm_nll_loss_forward_8", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602141704.259, "dur": 7.100, | |
"args": { | |
"External id": 216,"Record function id": 0, "Ev Idx": 300 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602141716.647, "dur": 20.802, | |
"args": { | |
"External id": 217,"Record function id": 0, "Ev Idx": 301 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused_nll_loss_forward_sum_9", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602141748.686, "dur": 8.613, | |
"args": { | |
"External id": 218,"Record function id": 0, "Ev Idx": 302 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::addmm", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602141763.809, "dur": 32.809, | |
"args": { | |
"External id": 219,"Record function id": 0, "Ev Idx": 303 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused_nll_loss_forward_10", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602141816.368, "dur": 10.195, | |
"args": { | |
"External id": 220,"Record function id": 0, "Ev Idx": 304 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "triton_per_fused_nll_loss_forward_11", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602141833.954, "dur": 7.391, | |
"args": { | |
"External id": 221,"Record function id": 0, "Ev Idx": 305 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::ones_like", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602141899.603, "dur": 17.186, | |
"args": { | |
"External id": 222,"Record function id": 0, "Ev Idx": 306 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602141900.244, "dur": 5.378, | |
"args": { | |
"External id": 223,"Record function id": 0, "Ev Idx": 307 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602141901.066, "dur": 4.136, | |
"args": { | |
"External id": 224,"Record function id": 0, "Ev Idx": 308 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602141906.223, "dur": 10.346, | |
"args": { | |
"External id": 225,"Record function id": 0, "Ev Idx": 309 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "overhead", "name": "Unrecognized", "pid": -1, "tid": 0, | |
"ts": 6157602135110.995, "dur": 1241.643 | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_red_fused_nll_loss_backward_nll_loss_forward_0", "pid": 0, "tid": 7, | |
"ts": 6157602136504.305, "dur": 7.359, | |
"args": { | |
"External id": 6, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 30, "registers per thread": 32, "shared memory": 16384, "blocks per SM": 0.030303, "warps per SM": 0.484848, "grid": [4, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 1 | |
} | |
}, | |
{ | |
"ph": "f", "id": 30, "pid": 0, "tid": 7, "ts": 6157602136504.305, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602136469.364, "dur": 36.254, | |
"args": { | |
"External id": 6, "cbid": 307, "correlation": 30 | |
} | |
}, | |
{ | |
"ph": "s", "id": 30, "pid": 2537909, "tid": 2537909, "ts": 6157602136469.364, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_per_fused_nll_loss_forward_1", "pid": 0, "tid": 7, | |
"ts": 6157602136543.888, "dur": 1.441, | |
"args": { | |
"External id": 7, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 38, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.015152, "grid": [1, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0 | |
} | |
}, | |
{ | |
"ph": "f", "id": 38, "pid": 0, "tid": 7, "ts": 6157602136543.888, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602136536.084, "dur": 6.340, | |
"args": { | |
"External id": 7, "cbid": 307, "correlation": 38 | |
} | |
}, | |
{ | |
"ph": "s", "id": 38, "pid": 2537909, "tid": 2537909, "ts": 6157602136536.084, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_mul_2", "pid": 0, "tid": 7, | |
"ts": 6157602136579.248, "dur": 47.200, | |
"args": { | |
"External id": 8, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 46, "registers per thread": 16, "shared memory": 0, "blocks per SM": 186.181824, "warps per SM": 744.727295, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 | |
} | |
}, | |
{ | |
"ph": "f", "id": 46, "pid": 0, "tid": 7, "ts": 6157602136579.248, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602136572.539, "dur": 5.108, | |
"args": { | |
"External id": 8, "cbid": 307, "correlation": 46 | |
} | |
}, | |
{ | |
"ph": "s", "id": 46, "pid": 2537909, "tid": 2537909, "ts": 6157602136572.539, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_3", "pid": 0, "tid": 7, | |
"ts": 6157602136698.897, "dur": 1.536, | |
"args": { | |
"External id": 29, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 53, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.007576, "grid": [1, 1, 1], "block": [32, 1, 1], "est. achieved occupancy %": 0 | |
} | |
}, | |
{ | |
"ph": "f", "id": 53, "pid": 0, "tid": 7, "ts": 6157602136698.897, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602136690.607, "dur": 6.190, | |
"args": { | |
"External id": 29, "cbid": 307, "correlation": 53 | |
} | |
}, | |
{ | |
"ph": "s", "id": 53, "pid": 2537909, "tid": 2537909, "ts": 6157602136690.607, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_4", "pid": 0, "tid": 7, | |
"ts": 6157602136722.865, "dur": 31.520, | |
"args": { | |
"External id": 30, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 60, "registers per thread": 16, "shared memory": 0, "blocks per SM": 285.553040, "warps per SM": 1142.212158, "grid": [37693, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 | |
} | |
}, | |
{ | |
"ph": "f", "id": 60, "pid": 0, "tid": 7, "ts": 6157602136722.865, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602136716.707, "dur": 4.546, | |
"args": { | |
"External id": 30, "cbid": 307, "correlation": 60 | |
} | |
}, | |
{ | |
"ph": "s", "id": 60, "pid": 2537909, "tid": 2537909, "ts": 6157602136716.707, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_5", "pid": 0, "tid": 7, | |
"ts": 6157602136755.409, "dur": 1.376, | |
"args": { | |
"External id": 31, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 67, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.750000, "warps per SM": 3.000000, "grid": [99, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 5 | |
} | |
}, | |
{ | |
"ph": "f", "id": 67, "pid": 0, "tid": 7, "ts": 6157602136755.409, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602136739.451, "dur": 7.461, | |
"args": { | |
"External id": 31, "cbid": 307, "correlation": 67 | |
} | |
}, | |
{ | |
"ph": "s", "id": 67, "pid": 2537909, "tid": 2537909, "ts": 6157602136739.451, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_addmm_mm_nll_loss_forward_6", "pid": 0, "tid": 7, | |
"ts": 6157602136801.745, "dur": 109.024, | |
"args": { | |
"External id": 32, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 82, "registers per thread": 16, "shared memory": 0, "blocks per SM": 285.590912, "warps per SM": 1142.363647, "grid": [37698, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 | |
} | |
}, | |
{ | |
"ph": "f", "id": 82, "pid": 0, "tid": 7, "ts": 6157602136801.745, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602136794.474, "dur": 5.518, | |
"args": { | |
"External id": 32, "cbid": 307, "correlation": 82 | |
} | |
}, | |
{ | |
"ph": "s", "id": 82, "pid": 2537909, "tid": 2537909, "ts": 6157602136794.474, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, | |
"ts": 6157602136913.649, "dur": 0.800, | |
"args": { | |
"External id": 33, "device": 0, "context": 1, "stream": 7, "correlation": 97, "bytes": 4, "memory bandwidth (GB/s)": 0.005 | |
} | |
}, | |
{ | |
"ph": "f", "id": 97, "pid": 0, "tid": 7, "ts": 6157602136913.649, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602136895.056, "dur": 13.740, | |
"args": { | |
"External id": 33, "cbid": 51, "correlation": 97 | |
} | |
}, | |
{ | |
"ph": "s", "id": 97, "pid": 2537909, "tid": 2537909, "ts": 6157602136895.056, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "nvjet_tst_208x192_64x4_2x1_v_bz_coopB_TNN", "pid": 0, "tid": 7, | |
"ts": 6157602136917.937, "dur": 4324.740, | |
"args": { | |
"External id": 33, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 98, "registers per thread": 168, "shared memory": 221340, "blocks per SM": 1.000000, "warps per SM": 12.000000, "grid": [2, 66, 1], "block": [384, 1, 1], "est. achieved occupancy %": 0 | |
} | |
}, | |
{ | |
"ph": "f", "id": 98, "pid": 0, "tid": 7, "ts": 6157602136917.937, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernelEx", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602136909.197, "dur": 7.251, | |
"args": { | |
"External id": 33, "cbid": 652, "correlation": 98 | |
} | |
}, | |
{ | |
"ph": "s", "id": 98, "pid": 2537909, "tid": 2537909, "ts": 6157602136909.197, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_red_fused__log_softmax__log_softmax_backward_data_nll_loss_backward_nll_loss_forward_7", "pid": 0, "tid": 7, | |
"ts": 6157602141243.893, "dur": 5738.277, | |
"args": { | |
"External id": 35, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 119, "registers per thread": 48, "shared memory": 32, "blocks per SM": 248.242432, "warps per SM": 1985.939453, "grid": [32768, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 63 | |
} | |
}, | |
{ | |
"ph": "f", "id": 119, "pid": 0, "tid": 7, "ts": 6157602141243.893, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602136968.727, "dur": 7.501, | |
"args": { | |
"External id": 35, "cbid": 307, "correlation": 119 | |
} | |
}, | |
{ | |
"ph": "s", "id": 119, "pid": 2537909, "tid": 2537909, "ts": 6157602136968.727, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_poi_fused__log_softmax_backward_data_mm_nll_loss_forward_8", "pid": 0, "tid": 7, | |
"ts": 6157602146984.218, "dur": 2.720, | |
"args": { | |
"External id": 36, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 122, "registers per thread": 16, "shared memory": 0, "blocks per SM": 3.393939, "warps per SM": 27.151516, "grid": [448, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 42 | |
} | |
}, | |
{ | |
"ph": "f", "id": 122, "pid": 0, "tid": 7, "ts": 6157602146984.218, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602136997.150, "dur": 5.207, | |
"args": { | |
"External id": 36, "cbid": 307, "correlation": 122 | |
} | |
}, | |
{ | |
"ph": "s", "id": 122, "pid": 2537909, "tid": 2537909, "ts": 6157602136997.150, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, | |
"ts": 6157602146988.314, "dur": 0.800, | |
"args": { | |
"External id": 37, "device": 0, "context": 1, "stream": 7, "correlation": 137, "bytes": 4, "memory bandwidth (GB/s)": 0.005 | |
} | |
}, | |
{ | |
"ph": "f", "id": 137, "pid": 0, "tid": 7, "ts": 6157602146988.314, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602137029.298, "dur": 3.656, | |
"args": { | |
"External id": 37, "cbid": 51, "correlation": 137 | |
} | |
}, | |
{ | |
"ph": "s", "id": 137, "pid": 2537909, "tid": 2537909, "ts": 6157602137029.298, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "nvjet_tst_256x128_64x4_1x2_h_bz_coopA_NNT", "pid": 0, "tid": 7, | |
"ts": 6157602146991.066, "dur": 3787.396, | |
"args": { | |
"External id": 37, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 138, "registers per thread": 168, "shared memory": 213148, "blocks per SM": 1.000000, "warps per SM": 12.000000, "grid": [2, 66, 1], "block": [384, 1, 1], "est. achieved occupancy %": 0 | |
} | |
}, | |
{ | |
"ph": "f", "id": 138, "pid": 0, "tid": 7, "ts": 6157602146991.066, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernelEx", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602137033.124, "dur": 5.728, | |
"args": { | |
"External id": 37, "cbid": 652, "correlation": 138 | |
} | |
}, | |
{ | |
"ph": "s", "id": 138, "pid": 2537909, "tid": 2537909, "ts": 6157602137033.124, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_red_fused_nll_loss_forward_sum_9", "pid": 0, "tid": 7, | |
"ts": 6157602150779.454, "dur": 1664.450, | |
"args": { | |
"External id": 38, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 149, "registers per thread": 40, "shared memory": 4096, "blocks per SM": 5.954545, "warps per SM": 95.272728, "grid": [786, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 75 | |
} | |
}, | |
{ | |
"ph": "f", "id": 149, "pid": 0, "tid": 7, "ts": 6157602150779.454, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602137068.327, "dur": 5.538, | |
"args": { | |
"External id": 38, "cbid": 307, "correlation": 149 | |
} | |
}, | |
{ | |
"ph": "s", "id": 149, "pid": 2537909, "tid": 2537909, "ts": 6157602137068.327, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, | |
"ts": 6157602152445.120, "dur": 74.944, | |
"args": { | |
"External id": 39, "device": 0, "context": 1, "stream": 7, "correlation": 156, "bytes": 77194752, "memory bandwidth (GB/s)": 1030.0324508966696 | |
} | |
}, | |
{ | |
"ph": "f", "id": 156, "pid": 0, "tid": 7, "ts": 6157602152445.120, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602137108.678, "dur": 25.148, | |
"args": { | |
"External id": 39, "cbid": 41, "correlation": 156 | |
} | |
}, | |
{ | |
"ph": "s", "id": 156, "pid": 2537909, "tid": 2537909, "ts": 6157602137108.678, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602137161.007, "dur": 2.614, | |
"args": { | |
"External id": 39, "cbid": 200, "correlation": 167 | |
} | |
}, | |
{ | |
"ph": "f", "id": 167, "pid": 2537909, "tid": 2537909, "ts": 6157602137161.007, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2<cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_256x128_32x6_nt_align8>(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_256x128_32x6_nt_align8::Params)", "pid": 0, "tid": 7, | |
"ts": 6157602152521.120, "dur": 5831.621, | |
"args": { | |
"External id": 39, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 170, "registers per thread": 216, "shared memory": 147456, "blocks per SM": 11.909091, "warps per SM": 95.272728, "grid": [1572, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 | |
} | |
}, | |
{ | |
"ph": "f", "id": 170, "pid": 0, "tid": 7, "ts": 6157602152521.120, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602137182.489, "dur": 5.418, | |
"args": { | |
"External id": 39, "cbid": 307, "correlation": 170 | |
} | |
}, | |
{ | |
"ph": "s", "id": 170, "pid": 2537909, "tid": 2537909, "ts": 6157602137182.489, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_red_fused_nll_loss_forward_10", "pid": 0, "tid": 7, | |
"ts": 6157602158353.957, "dur": 2.720, | |
"args": { | |
"External id": 40, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 182, "registers per thread": 26, "shared memory": 64, "blocks per SM": 0.030303, "warps per SM": 0.484848, "grid": [4, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 1 | |
} | |
}, | |
{ | |
"ph": "f", "id": 182, "pid": 0, "tid": 7, "ts": 6157602158353.957, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602137229.139, "dur": 5.158, | |
"args": { | |
"External id": 40, "cbid": 307, "correlation": 182 | |
} | |
}, | |
{ | |
"ph": "s", "id": 182, "pid": 2537909, "tid": 2537909, "ts": 6157602137229.139, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_per_fused_nll_loss_forward_11", "pid": 0, "tid": 7, | |
"ts": 6157602158358.661, "dur": 1.888, | |
"args": { | |
"External id": 41, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 187, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.015152, "grid": [1, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0 | |
} | |
}, | |
{ | |
"ph": "f", "id": 187, "pid": 0, "tid": 7, "ts": 6157602158358.661, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602137256.070, "dur": 4.136, | |
"args": { | |
"External id": 41, "cbid": 307, "correlation": 187 | |
} | |
}, | |
{ | |
"ph": "s", "id": 187, "pid": 2537909, "tid": 2537909, "ts": 6157602137256.070, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<8, at::native::FillFunctor<c10::BFloat16>, std::array<char*, 1ul> >(int, at::native::FillFunctor<c10::BFloat16>, std::array<char*, 1ul>)", "pid": 0, "tid": 7, | |
"ts": 6157602158361.765, "dur": 1.376, | |
"args": { | |
"External id": 45, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 198, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.030303, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 | |
} | |
}, | |
{ | |
"ph": "f", "id": 198, "pid": 0, "tid": 7, "ts": 6157602158361.765, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602137433.678, "dur": 9.915, | |
"args": { | |
"External id": 45, "cbid": 211, "correlation": 198 | |
} | |
}, | |
{ | |
"ph": "s", "id": 198, "pid": 2537909, "tid": 2537909, "ts": 6157602137433.678, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602137482.932, "dur": 2.784, | |
"args": { | |
"External id": 3, "cbid": 135, "correlation": 206 | |
} | |
}, | |
{ | |
"ph": "f", "id": 206, "pid": 2537909, "tid": 2537909, "ts": 6157602137482.932, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_0", "pid": 0, "tid": 7, | |
"ts": 6157602158364.325, "dur": 60.896, | |
"args": { | |
"External id": 515, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 215, "registers per thread": 16, "shared memory": 0, "blocks per SM": 285.553040, "warps per SM": 1142.212158, "grid": [37693, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 | |
} | |
}, | |
{ | |
"ph": "f", "id": 215, "pid": 0, "tid": 7, "ts": 6157602158364.325, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602138140.284, "dur": 43.816, | |
"args": { | |
"External id": 515, "cbid": 307, "correlation": 215 | |
} | |
}, | |
{ | |
"ph": "s", "id": 215, "pid": 2537909, "tid": 2544200, "ts": 6157602138140.284, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_1", "pid": 0, "tid": 7, | |
"ts": 6157602158426.437, "dur": 2.304, | |
"args": { | |
"External id": 516, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 219, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.750000, "warps per SM": 3.000000, "grid": [99, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 5 | |
} | |
}, | |
{ | |
"ph": "f", "id": 219, "pid": 0, "tid": 7, "ts": 6157602158426.437, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602138210.099, "dur": 6.019, | |
"args": { | |
"External id": 516, "cbid": 307, "correlation": 219 | |
} | |
}, | |
{ | |
"ph": "s", "id": 219, "pid": 2537909, "tid": 2544200, "ts": 6157602138210.099, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_mul_2", "pid": 0, "tid": 7, | |
"ts": 6157602158430.789, "dur": 47.840, | |
"args": { | |
"External id": 517, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 223, "registers per thread": 22, "shared memory": 0, "blocks per SM": 186.181824, "warps per SM": 744.727295, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 | |
} | |
}, | |
{ | |
"ph": "f", "id": 223, "pid": 0, "tid": 7, "ts": 6157602158430.789, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602138254.396, "dur": 5.518, | |
"args": { | |
"External id": 517, "cbid": 307, "correlation": 223 | |
} | |
}, | |
{ | |
"ph": "s", "id": 223, "pid": 2537909, "tid": 2544200, "ts": 6157602138254.396, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602138323.530, "dur": 3.035, | |
"args": { | |
"External id": 513, "cbid": 135, "correlation": 228 | |
} | |
}, | |
{ | |
"ph": "f", "id": 228, "pid": 2537909, "tid": 2544200, "ts": 6157602138323.530, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602138332.423, "dur": 0.601, | |
"args": { | |
"External id": 513, "cbid": 135, "correlation": 233 | |
} | |
}, | |
{ | |
"ph": "f", "id": 233, "pid": 2537909, "tid": 2544200, "ts": 6157602138332.423, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602138336.019, "dur": 0.360, | |
"args": { | |
"External id": 513, "cbid": 135, "correlation": 238 | |
} | |
}, | |
{ | |
"ph": "f", "id": 238, "pid": 2537909, "tid": 2544200, "ts": 6157602138336.019, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_red_fused_nll_loss_backward_nll_loss_forward_0", "pid": 0, "tid": 7, | |
"ts": 6157602158480.709, "dur": 7.648, | |
"args": { | |
"External id": 51, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 299, "registers per thread": 32, "shared memory": 16384, "blocks per SM": 0.030303, "warps per SM": 0.484848, "grid": [4, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 1 | |
} | |
}, | |
{ | |
"ph": "f", "id": 299, "pid": 0, "tid": 7, "ts": 6157602158480.709, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602138696.823, "dur": 12.559, | |
"args": { | |
"External id": 51, "cbid": 307, "correlation": 299 | |
} | |
}, | |
{ | |
"ph": "s", "id": 299, "pid": 2537909, "tid": 2537909, "ts": 6157602138696.823, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_per_fused_nll_loss_forward_1", "pid": 0, "tid": 7, | |
"ts": 6157602158489.541, "dur": 1.440, | |
"args": { | |
"External id": 52, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 307, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.015152, "grid": [1, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0 | |
} | |
}, | |
{ | |
"ph": "f", "id": 307, "pid": 0, "tid": 7, "ts": 6157602158489.541, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602138725.586, "dur": 4.107, | |
"args": { | |
"External id": 52, "cbid": 307, "correlation": 307 | |
} | |
}, | |
{ | |
"ph": "s", "id": 307, "pid": 2537909, "tid": 2537909, "ts": 6157602138725.586, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_mul_2", "pid": 0, "tid": 7, | |
"ts": 6157602158491.909, "dur": 48.480, | |
"args": { | |
"External id": 53, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 315, "registers per thread": 16, "shared memory": 0, "blocks per SM": 186.181824, "warps per SM": 744.727295, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 | |
} | |
}, | |
{ | |
"ph": "f", "id": 315, "pid": 0, "tid": 7, "ts": 6157602158491.909, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602138743.313, "dur": 3.385, | |
"args": { | |
"External id": 53, "cbid": 307, "correlation": 315 | |
} | |
}, | |
{ | |
"ph": "s", "id": 315, "pid": 2537909, "tid": 2537909, "ts": 6157602138743.313, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_3", "pid": 0, "tid": 7, | |
"ts": 6157602158542.470, "dur": 1.759, | |
"args": { | |
"External id": 74, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 322, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.007576, "grid": [1, 1, 1], "block": [32, 1, 1], "est. achieved occupancy %": 0 | |
} | |
}, | |
{ | |
"ph": "f", "id": 322, "pid": 0, "tid": 7, "ts": 6157602158542.470, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602138804.325, "dur": 4.998, | |
"args": { | |
"External id": 74, "cbid": 307, "correlation": 322 | |
} | |
}, | |
{ | |
"ph": "s", "id": 322, "pid": 2537909, "tid": 2537909, "ts": 6157602138804.325, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_4", "pid": 0, "tid": 7, | |
"ts": 6157602158545.189, "dur": 32.160, | |
"args": { | |
"External id": 75, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 329, "registers per thread": 16, "shared memory": 0, "blocks per SM": 285.553040, "warps per SM": 1142.212158, "grid": [37693, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 | |
} | |
}, | |
{ | |
"ph": "f", "id": 329, "pid": 0, "tid": 7, "ts": 6157602158545.189, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602138820.650, "dur": 2.974, | |
"args": { | |
"External id": 75, "cbid": 307, "correlation": 329 | |
} | |
}, | |
{ | |
"ph": "s", "id": 329, "pid": 2537909, "tid": 2537909, "ts": 6157602138820.650, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_5", "pid": 0, "tid": 7, | |
"ts": 6157602158578.277, "dur": 1.568, | |
"args": { | |
"External id": 76, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 336, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.750000, "warps per SM": 3.000000, "grid": [99, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 5 | |
} | |
}, | |
{ | |
"ph": "f", "id": 336, "pid": 0, "tid": 7, "ts": 6157602158578.277, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602138834.240, "dur": 2.664, | |
"args": { | |
"External id": 76, "cbid": 307, "correlation": 336 | |
} | |
}, | |
{ | |
"ph": "s", "id": 336, "pid": 2537909, "tid": 2537909, "ts": 6157602138834.240, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_addmm_mm_nll_loss_forward_6", "pid": 0, "tid": 7, | |
"ts": 6157602158580.805, "dur": 108.577, | |
"args": { | |
"External id": 77, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 351, "registers per thread": 16, "shared memory": 0, "blocks per SM": 285.590912, "warps per SM": 1142.363647, "grid": [37698, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 | |
} | |
}, | |
{ | |
"ph": "f", "id": 351, "pid": 0, "tid": 7, "ts": 6157602158580.805, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602138867.721, "dur": 4.046, | |
"args": { | |
"External id": 77, "cbid": 307, "correlation": 351 | |
} | |
}, | |
{ | |
"ph": "s", "id": 351, "pid": 2537909, "tid": 2537909, "ts": 6157602138867.721, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, | |
"ts": 6157602158691.014, "dur": 0.959, | |
"args": { | |
"External id": 78, "device": 0, "context": 1, "stream": 7, "correlation": 366, "bytes": 4, "memory bandwidth (GB/s)": 0.004171011470281543 | |
} | |
}, | |
{ | |
"ph": "f", "id": 366, "pid": 0, "tid": 7, "ts": 6157602158691.014, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602138903.955, "dur": 5.008, | |
"args": { | |
"External id": 78, "cbid": 51, "correlation": 366 | |
} | |
}, | |
{ | |
"ph": "s", "id": 366, "pid": 2537909, "tid": 2537909, "ts": 6157602138903.955, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "nvjet_tst_208x192_64x4_2x1_v_bz_coopB_TNN", "pid": 0, "tid": 7, | |
"ts": 6157602158694.533, "dur": 4341.189, | |
"args": { | |
"External id": 78, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 367, "registers per thread": 168, "shared memory": 221340, "blocks per SM": 1.000000, "warps per SM": 12.000000, "grid": [2, 66, 1], "block": [384, 1, 1], "est. achieved occupancy %": 0 | |
} | |
}, | |
{ | |
"ph": "f", "id": 367, "pid": 0, "tid": 7, "ts": 6157602158694.533, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernelEx", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602138909.173, "dur": 5.038, | |
"args": { | |
"External id": 78, "cbid": 652, "correlation": 367 | |
} | |
}, | |
{ | |
"ph": "s", "id": 367, "pid": 2537909, "tid": 2537909, "ts": 6157602138909.173, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_red_fused__log_softmax__log_softmax_backward_data_nll_loss_backward_nll_loss_forward_7", "pid": 0, "tid": 7, | |
"ts": 6157602163037.834, "dur": 5767.205, | |
"args": { | |
"External id": 80, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 388, "registers per thread": 48, "shared memory": 32, "blocks per SM": 248.242432, "warps per SM": 1985.939453, "grid": [32768, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 63 | |
} | |
}, | |
{ | |
"ph": "f", "id": 388, "pid": 0, "tid": 7, "ts": 6157602163037.834, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602138947.511, "dur": 4.487, | |
"args": { | |
"External id": 80, "cbid": 307, "correlation": 388 | |
} | |
}, | |
{ | |
"ph": "s", "id": 388, "pid": 2537909, "tid": 2537909, "ts": 6157602138947.511, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_poi_fused__log_softmax_backward_data_mm_nll_loss_forward_8", "pid": 0, "tid": 7, | |
"ts": 6157602168806.031, "dur": 2.720, | |
"args": { | |
"External id": 81, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 391, "registers per thread": 16, "shared memory": 0, "blocks per SM": 3.393939, "warps per SM": 27.151516, "grid": [448, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 42 | |
} | |
}, | |
{ | |
"ph": "f", "id": 391, "pid": 0, "tid": 7, "ts": 6157602168806.031, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602138965.158, "dur": 2.994, | |
"args": { | |
"External id": 81, "cbid": 307, "correlation": 391 | |
} | |
}, | |
{ | |
"ph": "s", "id": 391, "pid": 2537909, "tid": 2537909, "ts": 6157602138965.158, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, | |
"ts": 6157602168810.543, "dur": 0.992, | |
"args": { | |
"External id": 82, "device": 0, "context": 1, "stream": 7, "correlation": 406, "bytes": 4, "memory bandwidth (GB/s)": 0.004032258064516129 | |
} | |
}, | |
{ | |
"ph": "f", "id": 406, "pid": 0, "tid": 7, "ts": 6157602168810.543, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602138987.571, "dur": 3.185, | |
"args": { | |
"External id": 82, "cbid": 51, "correlation": 406 | |
} | |
}, | |
{ | |
"ph": "s", "id": 406, "pid": 2537909, "tid": 2537909, "ts": 6157602138987.571, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "nvjet_tst_256x128_64x4_1x2_h_bz_coopA_NNT", "pid": 0, "tid": 7, | |
"ts": 6157602168813.007, "dur": 3761.860, | |
"args": { | |
"External id": 82, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 407, "registers per thread": 168, "shared memory": 213148, "blocks per SM": 1.000000, "warps per SM": 12.000000, "grid": [2, 66, 1], "block": [384, 1, 1], "est. achieved occupancy %": 0 | |
} | |
}, | |
{ | |
"ph": "f", "id": 407, "pid": 0, "tid": 7, "ts": 6157602168813.007, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernelEx", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602138990.927, "dur": 3.996, | |
"args": { | |
"External id": 82, "cbid": 652, "correlation": 407 | |
} | |
}, | |
{ | |
"ph": "s", "id": 407, "pid": 2537909, "tid": 2537909, "ts": 6157602138990.927, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_red_fused_nll_loss_forward_sum_9", "pid": 0, "tid": 7, | |
"ts": 6157602172576.915, "dur": 1672.481, | |
"args": { | |
"External id": 83, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 418, "registers per thread": 40, "shared memory": 4096, "blocks per SM": 5.954545, "warps per SM": 95.272728, "grid": [786, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 75 | |
} | |
}, | |
{ | |
"ph": "f", "id": 418, "pid": 0, "tid": 7, "ts": 6157602172576.915, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602139013.420, "dur": 3.265, | |
"args": { | |
"External id": 83, "cbid": 307, "correlation": 418 | |
} | |
}, | |
{ | |
"ph": "s", "id": 418, "pid": 2537909, "tid": 2537909, "ts": 6157602139013.420, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, | |
"ts": 6157602174251.316, "dur": 75.648, | |
"args": { | |
"External id": 84, "device": 0, "context": 1, "stream": 7, "correlation": 425, "bytes": 77194752, "memory bandwidth (GB/s)": 1020.4467005076142 | |
} | |
}, | |
{ | |
"ph": "f", "id": 425, "pid": 0, "tid": 7, "ts": 6157602174251.316, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602139031.107, "dur": 11.618, | |
"args": { | |
"External id": 84, "cbid": 41, "correlation": 425 | |
} | |
}, | |
{ | |
"ph": "s", "id": 425, "pid": 2537909, "tid": 2537909, "ts": 6157602139031.107, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602139052.840, "dur": 0.731, | |
"args": { | |
"External id": 84, "cbid": 200, "correlation": 436 | |
} | |
}, | |
{ | |
"ph": "f", "id": 436, "pid": 2537909, "tid": 2537909, "ts": 6157602139052.840, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2<cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_256x128_32x6_nt_align8>(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_256x128_32x6_nt_align8::Params)", "pid": 0, "tid": 7, | |
"ts": 6157602174328.020, "dur": 5794.694, | |
"args": { | |
"External id": 84, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 439, "registers per thread": 216, "shared memory": 147456, "blocks per SM": 11.909091, "warps per SM": 95.272728, "grid": [1572, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 | |
} | |
}, | |
{ | |
"ph": "f", "id": 439, "pid": 0, "tid": 7, "ts": 6157602174328.020, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602139054.863, "dur": 3.255, | |
"args": { | |
"External id": 84, "cbid": 307, "correlation": 439 | |
} | |
}, | |
{ | |
"ph": "s", "id": 439, "pid": 2537909, "tid": 2537909, "ts": 6157602139054.863, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_red_fused_nll_loss_forward_10", "pid": 0, "tid": 7, | |
"ts": 6157602180124.890, "dur": 2.848, | |
"args": { | |
"External id": 85, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 451, "registers per thread": 26, "shared memory": 64, "blocks per SM": 0.030303, "warps per SM": 0.484848, "grid": [4, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 1 | |
} | |
}, | |
{ | |
"ph": "f", "id": 451, "pid": 0, "tid": 7, "ts": 6157602180124.890, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602139084.618, "dur": 3.685, | |
"args": { | |
"External id": 85, "cbid": 307, "correlation": 451 | |
} | |
}, | |
{ | |
"ph": "s", "id": 451, "pid": 2537909, "tid": 2537909, "ts": 6157602139084.618, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_per_fused_nll_loss_forward_11", "pid": 0, "tid": 7, | |
"ts": 6157602180128.794, "dur": 1.792, | |
"args": { | |
"External id": 86, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 456, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.015152, "grid": [1, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0 | |
} | |
}, | |
{ | |
"ph": "f", "id": 456, "pid": 0, "tid": 7, "ts": 6157602180128.794, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602139099.831, "dur": 2.854, | |
"args": { | |
"External id": 86, "cbid": 307, "correlation": 456 | |
} | |
}, | |
{ | |
"ph": "s", "id": 456, "pid": 2537909, "tid": 2537909, "ts": 6157602139099.831, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<8, at::native::FillFunctor<c10::BFloat16>, std::array<char*, 1ul> >(int, at::native::FillFunctor<c10::BFloat16>, std::array<char*, 1ul>)", "pid": 0, "tid": 7, | |
"ts": 6157602180131.578, "dur": 1.504, | |
"args": { | |
"External id": 90, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 467, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.030303, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 | |
} | |
}, | |
{ | |
"ph": "f", "id": 467, "pid": 0, "tid": 7, "ts": 6157602180131.578, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602139173.402, "dur": 5.869, | |
"args": { | |
"External id": 90, "cbid": 211, "correlation": 467 | |
} | |
}, | |
{ | |
"ph": "s", "id": 467, "pid": 2537909, "tid": 2537909, "ts": 6157602139173.402, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602139196.306, "dur": 1.112, | |
"args": { | |
"External id": 48, "cbid": 135, "correlation": 475 | |
} | |
}, | |
{ | |
"ph": "f", "id": 475, "pid": 2537909, "tid": 2537909, "ts": 6157602139196.306, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_0", "pid": 0, "tid": 7, | |
"ts": 6157602180134.266, "dur": 60.928, | |
"args": { | |
"External id": 532, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 484, "registers per thread": 16, "shared memory": 0, "blocks per SM": 285.553040, "warps per SM": 1142.212158, "grid": [37693, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 | |
} | |
}, | |
{ | |
"ph": "f", "id": 484, "pid": 0, "tid": 7, "ts": 6157602180134.266, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602139332.101, "dur": 9.865, | |
"args": { | |
"External id": 532, "cbid": 307, "correlation": 484 | |
} | |
}, | |
{ | |
"ph": "s", "id": 484, "pid": 2537909, "tid": 2544200, "ts": 6157602139332.101, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_1", "pid": 0, "tid": 7, | |
"ts": 6157602180197.210, "dur": 2.432, | |
"args": { | |
"External id": 533, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 488, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.750000, "warps per SM": 3.000000, "grid": [99, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 5 | |
} | |
}, | |
{ | |
"ph": "f", "id": 488, "pid": 0, "tid": 7, "ts": 6157602180197.210, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602139354.755, "dur": 3.496, | |
"args": { | |
"External id": 533, "cbid": 307, "correlation": 488 | |
} | |
}, | |
{ | |
"ph": "s", "id": 488, "pid": 2537909, "tid": 2544200, "ts": 6157602139354.755, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_mul_2", "pid": 0, "tid": 7, | |
"ts": 6157602180200.858, "dur": 47.776, | |
"args": { | |
"External id": 534, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 492, "registers per thread": 22, "shared memory": 0, "blocks per SM": 186.181824, "warps per SM": 744.727295, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 | |
} | |
}, | |
{ | |
"ph": "f", "id": 492, "pid": 0, "tid": 7, "ts": 6157602180200.858, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602139372.262, "dur": 3.685, | |
"args": { | |
"External id": 534, "cbid": 307, "correlation": 492 | |
} | |
}, | |
{ | |
"ph": "s", "id": 492, "pid": 2537909, "tid": 2544200, "ts": 6157602139372.262, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602139395.547, "dur": 1.011, | |
"args": { | |
"External id": 530, "cbid": 135, "correlation": 497 | |
} | |
}, | |
{ | |
"ph": "f", "id": 497, "pid": 2537909, "tid": 2544200, "ts": 6157602139395.547, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602139399.743, "dur": 0.391, | |
"args": { | |
"External id": 530, "cbid": 135, "correlation": 502 | |
} | |
}, | |
{ | |
"ph": "f", "id": 502, "pid": 2537909, "tid": 2544200, "ts": 6157602139399.743, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602139402.627, "dur": 0.341, | |
"args": { | |
"External id": 530, "cbid": 135, "correlation": 507 | |
} | |
}, | |
{ | |
"ph": "f", "id": 507, "pid": 2537909, "tid": 2544200, "ts": 6157602139402.627, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_red_fused_nll_loss_backward_nll_loss_forward_0", "pid": 0, "tid": 7, | |
"ts": 6157602180249.850, "dur": 8.512, | |
"args": { | |
"External id": 96, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 568, "registers per thread": 32, "shared memory": 16384, "blocks per SM": 0.030303, "warps per SM": 0.484848, "grid": [4, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 1 | |
} | |
}, | |
{ | |
"ph": "f", "id": 568, "pid": 0, "tid": 7, "ts": 6157602180249.850, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602139646.766, "dur": 11.156, | |
"args": { | |
"External id": 96, "cbid": 307, "correlation": 568 | |
} | |
}, | |
{ | |
"ph": "s", "id": 568, "pid": 2537909, "tid": 2537909, "ts": 6157602139646.766, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_per_fused_nll_loss_forward_1", "pid": 0, "tid": 7, | |
"ts": 6157602180259.450, "dur": 1.920, | |
"args": { | |
"External id": 97, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 576, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.015152, "grid": [1, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0 | |
} | |
}, | |
{ | |
"ph": "f", "id": 576, "pid": 0, "tid": 7, "ts": 6157602180259.450, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602139671.313, "dur": 4.977, | |
"args": { | |
"External id": 97, "cbid": 307, "correlation": 576 | |
} | |
}, | |
{ | |
"ph": "s", "id": 576, "pid": 2537909, "tid": 2537909, "ts": 6157602139671.313, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_mul_2", "pid": 0, "tid": 7, | |
"ts": 6157602180263.514, "dur": 48.320, | |
"args": { | |
"External id": 98, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 584, "registers per thread": 16, "shared memory": 0, "blocks per SM": 186.181824, "warps per SM": 744.727295, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 | |
} | |
}, | |
{ | |
"ph": "f", "id": 584, "pid": 0, "tid": 7, "ts": 6157602180263.514, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602139689.110, "dur": 3.345, | |
"args": { | |
"External id": 98, "cbid": 307, "correlation": 584 | |
} | |
}, | |
{ | |
"ph": "s", "id": 584, "pid": 2537909, "tid": 2537909, "ts": 6157602139689.110, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_3", "pid": 0, "tid": 7, | |
"ts": 6157602180314.010, "dur": 1.632, | |
"args": { | |
"External id": 119, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 591, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.007576, "grid": [1, 1, 1], "block": [32, 1, 1], "est. achieved occupancy %": 0 | |
} | |
}, | |
{ | |
"ph": "f", "id": 591, "pid": 0, "tid": 7, "ts": 6157602180314.010, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602139748.719, "dur": 4.407, | |
"args": { | |
"External id": 119, "cbid": 307, "correlation": 591 | |
} | |
}, | |
{ | |
"ph": "s", "id": 591, "pid": 2537909, "tid": 2537909, "ts": 6157602139748.719, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_4", "pid": 0, "tid": 7, | |
"ts": 6157602180316.634, "dur": 31.776, | |
"args": { | |
"External id": 120, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 598, "registers per thread": 16, "shared memory": 0, "blocks per SM": 285.553040, "warps per SM": 1142.212158, "grid": [37693, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 | |
} | |
}, | |
{ | |
"ph": "f", "id": 598, "pid": 0, "tid": 7, "ts": 6157602180316.634, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602139763.712, "dur": 3.425, | |
"args": { | |
"External id": 120, "cbid": 307, "correlation": 598 | |
} | |
}, | |
{ | |
"ph": "s", "id": 598, "pid": 2537909, "tid": 2537909, "ts": 6157602139763.712, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_5", "pid": 0, "tid": 7, | |
"ts": 6157602180349.658, "dur": 1.792, | |
"args": { | |
"External id": 121, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 605, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.750000, "warps per SM": 3.000000, "grid": [99, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 5 | |
} | |
}, | |
{ | |
"ph": "f", "id": 605, "pid": 0, "tid": 7, "ts": 6157602180349.658, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602139777.974, "dur": 2.954, | |
"args": { | |
"External id": 121, "cbid": 307, "correlation": 605 | |
} | |
}, | |
{ | |
"ph": "s", "id": 605, "pid": 2537909, "tid": 2537909, "ts": 6157602139777.974, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_addmm_mm_nll_loss_forward_6", "pid": 0, "tid": 7, | |
"ts": 6157602180352.474, "dur": 108.384, | |
"args": { | |
"External id": 122, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 620, "registers per thread": 16, "shared memory": 0, "blocks per SM": 285.590912, "warps per SM": 1142.363647, "grid": [37698, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 | |
} | |
}, | |
{ | |
"ph": "f", "id": 620, "pid": 0, "tid": 7, "ts": 6157602180352.474, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602139810.583, "dur": 3.345, | |
"args": { | |
"External id": 122, "cbid": 307, "correlation": 620 | |
} | |
}, | |
{ | |
"ph": "s", "id": 620, "pid": 2537909, "tid": 2537909, "ts": 6157602139810.583, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, | |
"ts": 6157602180462.362, "dur": 1.184, | |
"args": { | |
"External id": 123, "device": 0, "context": 1, "stream": 7, "correlation": 635, "bytes": 4, "memory bandwidth (GB/s)": 0.0033783783783783786 | |
} | |
}, | |
{ | |
"ph": "f", "id": 635, "pid": 0, "tid": 7, "ts": 6157602180462.362, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602139841.339, "dur": 4.547, | |
"args": { | |
"External id": 123, "cbid": 51, "correlation": 635 | |
} | |
}, | |
{ | |
"ph": "s", "id": 635, "pid": 2537909, "tid": 2537909, "ts": 6157602139841.339, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "nvjet_tst_208x192_64x4_2x1_v_bz_coopB_TNN", "pid": 0, "tid": 7, | |
"ts": 6157602180465.402, "dur": 4310.468, | |
"args": { | |
"External id": 123, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 636, "registers per thread": 168, "shared memory": 221340, "blocks per SM": 1.000000, "warps per SM": 12.000000, "grid": [2, 66, 1], "block": [384, 1, 1], "est. achieved occupancy %": 0 | |
} | |
}, | |
{ | |
"ph": "f", "id": 636, "pid": 0, "tid": 7, "ts": 6157602180465.402, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernelEx", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602139846.126, "dur": 4.868, | |
"args": { | |
"External id": 123, "cbid": 652, "correlation": 636 | |
} | |
}, | |
{ | |
"ph": "s", "id": 636, "pid": 2537909, "tid": 2537909, "ts": 6157602139846.126, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_red_fused__log_softmax__log_softmax_backward_data_nll_loss_backward_nll_loss_forward_7", "pid": 0, "tid": 7, | |
"ts": 6157602184776.926, "dur": 7076.519, | |
"args": { | |
"External id": 125, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 657, "registers per thread": 48, "shared memory": 32, "blocks per SM": 248.242432, "warps per SM": 1985.939453, "grid": [32768, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 63 | |
} | |
}, | |
{ | |
"ph": "f", "id": 657, "pid": 0, "tid": 7, "ts": 6157602184776.926, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602139883.112, "dur": 4.267, | |
"args": { | |
"External id": 125, "cbid": 307, "correlation": 657 | |
} | |
}, | |
{ | |
"ph": "s", "id": 657, "pid": 2537909, "tid": 2537909, "ts": 6157602139883.112, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_poi_fused__log_softmax_backward_data_mm_nll_loss_forward_8", "pid": 0, "tid": 7, | |
"ts": 6157602191854.853, "dur": 2.752, | |
"args": { | |
"External id": 126, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 660, "registers per thread": 16, "shared memory": 0, "blocks per SM": 3.393939, "warps per SM": 27.151516, "grid": [448, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 42 | |
} | |
}, | |
{ | |
"ph": "f", "id": 660, "pid": 0, "tid": 7, "ts": 6157602191854.853, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602139899.978, "dur": 3.234, | |
"args": { | |
"External id": 126, "cbid": 307, "correlation": 660 | |
} | |
}, | |
{ | |
"ph": "s", "id": 660, "pid": 2537909, "tid": 2537909, "ts": 6157602139899.978, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, | |
"ts": 6157602191859.333, "dur": 0.992, | |
"args": { | |
"External id": 127, "device": 0, "context": 1, "stream": 7, "correlation": 675, "bytes": 4, "memory bandwidth (GB/s)": 0.004032258064516129 | |
} | |
}, | |
{ | |
"ph": "f", "id": 675, "pid": 0, "tid": 7, "ts": 6157602191859.333, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602139921.139, "dur": 3.195, | |
"args": { | |
"External id": 127, "cbid": 51, "correlation": 675 | |
} | |
}, | |
{ | |
"ph": "s", "id": 675, "pid": 2537909, "tid": 2537909, "ts": 6157602139921.139, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "nvjet_tst_256x128_64x4_1x2_h_bz_coopA_NNT", "pid": 0, "tid": 7, | |
"ts": 6157602191863.045, "dur": 3698.628, | |
"args": { | |
"External id": 127, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 676, "registers per thread": 168, "shared memory": 213148, "blocks per SM": 1.000000, "warps per SM": 12.000000, "grid": [2, 66, 1], "block": [384, 1, 1], "est. achieved occupancy %": 0 | |
} | |
}, | |
{ | |
"ph": "f", "id": 676, "pid": 0, "tid": 7, "ts": 6157602191863.045, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernelEx", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602139924.485, "dur": 3.805, | |
"args": { | |
"External id": 127, "cbid": 652, "correlation": 676 | |
} | |
}, | |
{ | |
"ph": "s", "id": 676, "pid": 2537909, "tid": 2537909, "ts": 6157602139924.485, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_red_fused_nll_loss_forward_sum_9", "pid": 0, "tid": 7, | |
"ts": 6157602195563.816, "dur": 1675.202, | |
"args": { | |
"External id": 128, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 687, "registers per thread": 40, "shared memory": 4096, "blocks per SM": 5.954545, "warps per SM": 95.272728, "grid": [786, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 75 | |
} | |
}, | |
{ | |
"ph": "f", "id": 687, "pid": 0, "tid": 7, "ts": 6157602195563.816, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602139946.428, "dur": 3.775, | |
"args": { | |
"External id": 128, "cbid": 307, "correlation": 687 | |
} | |
}, | |
{ | |
"ph": "s", "id": 687, "pid": 2537909, "tid": 2537909, "ts": 6157602139946.428, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, | |
"ts": 6157602197240.074, "dur": 75.712, | |
"args": { | |
"External id": 129, "device": 0, "context": 1, "stream": 7, "correlation": 694, "bytes": 77194752, "memory bandwidth (GB/s)": 1019.5841081994928 | |
} | |
}, | |
{ | |
"ph": "f", "id": 694, "pid": 0, "tid": 7, "ts": 6157602197240.074, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602139963.604, "dur": 10.826, | |
"args": { | |
"External id": 129, "cbid": 41, "correlation": 694 | |
} | |
}, | |
{ | |
"ph": "s", "id": 694, "pid": 2537909, "tid": 2537909, "ts": 6157602139963.604, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602139984.475, "dur": 0.491, | |
"args": { | |
"External id": 129, "cbid": 200, "correlation": 705 | |
} | |
}, | |
{ | |
"ph": "f", "id": 705, "pid": 2537909, "tid": 2537909, "ts": 6157602139984.475, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2<cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_256x128_32x6_nt_align8>(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_256x128_32x6_nt_align8::Params)", "pid": 0, "tid": 7, | |
"ts": 6157602197317.994, "dur": 5983.974, | |
"args": { | |
"External id": 129, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 708, "registers per thread": 216, "shared memory": 147456, "blocks per SM": 11.909091, "warps per SM": 95.272728, "grid": [1572, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 | |
} | |
}, | |
{ | |
"ph": "f", "id": 708, "pid": 0, "tid": 7, "ts": 6157602197317.994, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602139986.348, "dur": 3.155, | |
"args": { | |
"External id": 129, "cbid": 307, "correlation": 708 | |
} | |
}, | |
{ | |
"ph": "s", "id": 708, "pid": 2537909, "tid": 2537909, "ts": 6157602139986.348, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_red_fused_nll_loss_forward_10", "pid": 0, "tid": 7, | |
"ts": 6157602203304.208, "dur": 2.976, | |
"args": { | |
"External id": 130, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 720, "registers per thread": 26, "shared memory": 64, "blocks per SM": 0.030303, "warps per SM": 0.484848, "grid": [4, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 1 | |
} | |
}, | |
{ | |
"ph": "f", "id": 720, "pid": 0, "tid": 7, "ts": 6157602203304.208, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602140016.283, "dur": 3.345, | |
"args": { | |
"External id": 130, "cbid": 307, "correlation": 720 | |
} | |
}, | |
{ | |
"ph": "s", "id": 720, "pid": 2537909, "tid": 2537909, "ts": 6157602140016.283, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_per_fused_nll_loss_forward_11", "pid": 0, "tid": 7, | |
"ts": 6157602203308.208, "dur": 1.920, | |
"args": { | |
"External id": 131, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 725, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.015152, "grid": [1, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0 | |
} | |
}, | |
{ | |
"ph": "f", "id": 725, "pid": 0, "tid": 7, "ts": 6157602203308.208, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602140030.364, "dur": 2.995, | |
"args": { | |
"External id": 131, "cbid": 307, "correlation": 725 | |
} | |
}, | |
{ | |
"ph": "s", "id": 725, "pid": 2537909, "tid": 2537909, "ts": 6157602140030.364, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<8, at::native::FillFunctor<c10::BFloat16>, std::array<char*, 1ul> >(int, at::native::FillFunctor<c10::BFloat16>, std::array<char*, 1ul>)", "pid": 0, "tid": 7, | |
"ts": 6157602203311.280, "dur": 1.792, | |
"args": { | |
"External id": 135, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 736, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.030303, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 | |
} | |
}, | |
{ | |
"ph": "f", "id": 736, "pid": 0, "tid": 7, "ts": 6157602203311.280, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602140099.168, "dur": 5.879, | |
"args": { | |
"External id": 135, "cbid": 211, "correlation": 736 | |
} | |
}, | |
{ | |
"ph": "s", "id": 736, "pid": 2537909, "tid": 2537909, "ts": 6157602140099.168, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602140121.161, "dur": 1.132, | |
"args": { | |
"External id": 93, "cbid": 135, "correlation": 744 | |
} | |
}, | |
{ | |
"ph": "f", "id": 744, "pid": 2537909, "tid": 2537909, "ts": 6157602140121.161, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_0", "pid": 0, "tid": 7, | |
"ts": 6157602203314.480, "dur": 61.280, | |
"args": { | |
"External id": 549, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 753, "registers per thread": 16, "shared memory": 0, "blocks per SM": 285.553040, "warps per SM": 1142.212158, "grid": [37693, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 | |
} | |
}, | |
{ | |
"ph": "f", "id": 753, "pid": 0, "tid": 7, "ts": 6157602203314.480, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602140240.000, "dur": 9.655, | |
"args": { | |
"External id": 549, "cbid": 307, "correlation": 753 | |
} | |
}, | |
{ | |
"ph": "s", "id": 753, "pid": 2537909, "tid": 2544200, "ts": 6157602140240.000, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_1", "pid": 0, "tid": 7, | |
"ts": 6157602203378.032, "dur": 2.496, | |
"args": { | |
"External id": 550, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 757, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.750000, "warps per SM": 3.000000, "grid": [99, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 5 | |
} | |
}, | |
{ | |
"ph": "f", "id": 757, "pid": 0, "tid": 7, "ts": 6157602203378.032, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602140259.970, "dur": 2.935, | |
"args": { | |
"External id": 550, "cbid": 307, "correlation": 757 | |
} | |
}, | |
{ | |
"ph": "s", "id": 757, "pid": 2537909, "tid": 2544200, "ts": 6157602140259.970, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_mul_2", "pid": 0, "tid": 7, | |
"ts": 6157602203381.872, "dur": 48.128, | |
"args": { | |
"External id": 551, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 761, "registers per thread": 22, "shared memory": 0, "blocks per SM": 186.181824, "warps per SM": 744.727295, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 | |
} | |
}, | |
{ | |
"ph": "f", "id": 761, "pid": 0, "tid": 7, "ts": 6157602203381.872, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602140275.905, "dur": 2.503, | |
"args": { | |
"External id": 551, "cbid": 307, "correlation": 761 | |
} | |
}, | |
{ | |
"ph": "s", "id": 761, "pid": 2537909, "tid": 2544200, "ts": 6157602140275.905, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602140312.069, "dur": 0.881, | |
"args": { | |
"External id": 547, "cbid": 135, "correlation": 766 | |
} | |
}, | |
{ | |
"ph": "f", "id": 766, "pid": 2537909, "tid": 2544200, "ts": 6157602140312.069, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602140316.826, "dur": 0.581, | |
"args": { | |
"External id": 547, "cbid": 135, "correlation": 771 | |
} | |
}, | |
{ | |
"ph": "f", "id": 771, "pid": 2537909, "tid": 2544200, "ts": 6157602140316.826, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602140319.891, "dur": 0.340, | |
"args": { | |
"External id": 547, "cbid": 135, "correlation": 776 | |
} | |
}, | |
{ | |
"ph": "f", "id": 776, "pid": 2537909, "tid": 2544200, "ts": 6157602140319.891, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_red_fused_nll_loss_backward_nll_loss_forward_0", "pid": 0, "tid": 7, | |
"ts": 6157602203431.088, "dur": 8.832, | |
"args": { | |
"External id": 141, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 837, "registers per thread": 32, "shared memory": 16384, "blocks per SM": 0.030303, "warps per SM": 0.484848, "grid": [4, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 1 | |
} | |
}, | |
{ | |
"ph": "f", "id": 837, "pid": 0, "tid": 7, "ts": 6157602203431.088, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602140556.227, "dur": 12.018, | |
"args": { | |
"External id": 141, "cbid": 307, "correlation": 837 | |
} | |
}, | |
{ | |
"ph": "s", "id": 837, "pid": 2537909, "tid": 2537909, "ts": 6157602140556.227, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_per_fused_nll_loss_forward_1", "pid": 0, "tid": 7, | |
"ts": 6157602203442.160, "dur": 1.920, | |
"args": { | |
"External id": 142, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 845, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.015152, "grid": [1, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0 | |
} | |
}, | |
{ | |
"ph": "f", "id": 845, "pid": 0, "tid": 7, "ts": 6157602203442.160, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602140582.988, "dur": 4.196, | |
"args": { | |
"External id": 142, "cbid": 307, "correlation": 845 | |
} | |
}, | |
{ | |
"ph": "s", "id": 845, "pid": 2537909, "tid": 2537909, "ts": 6157602140582.988, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_mul_2", "pid": 0, "tid": 7, | |
"ts": 6157602203445.168, "dur": 48.704, | |
"args": { | |
"External id": 143, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 853, "registers per thread": 16, "shared memory": 0, "blocks per SM": 186.181824, "warps per SM": 744.727295, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 | |
} | |
}, | |
{ | |
"ph": "f", "id": 853, "pid": 0, "tid": 7, "ts": 6157602203445.168, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602140599.773, "dur": 3.435, | |
"args": { | |
"External id": 143, "cbid": 307, "correlation": 853 | |
} | |
}, | |
{ | |
"ph": "s", "id": 853, "pid": 2537909, "tid": 2537909, "ts": 6157602140599.773, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_3", "pid": 0, "tid": 7, | |
"ts": 6157602203494.960, "dur": 1.696, | |
"args": { | |
"External id": 164, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 860, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.007576, "grid": [1, 1, 1], "block": [32, 1, 1], "est. achieved occupancy %": 0 | |
} | |
}, | |
{ | |
"ph": "f", "id": 860, "pid": 0, "tid": 7, "ts": 6157602203494.960, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602140657.470, "dur": 4.016, | |
"args": { | |
"External id": 164, "cbid": 307, "correlation": 860 | |
} | |
}, | |
{ | |
"ph": "s", "id": 860, "pid": 2537909, "tid": 2537909, "ts": 6157602140657.470, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_4", "pid": 0, "tid": 7, | |
"ts": 6157602203497.808, "dur": 32.192, | |
"args": { | |
"External id": 165, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 867, "registers per thread": 16, "shared memory": 0, "blocks per SM": 285.553040, "warps per SM": 1142.212158, "grid": [37693, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 | |
} | |
}, | |
{ | |
"ph": "f", "id": 867, "pid": 0, "tid": 7, "ts": 6157602203497.808, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602140672.212, "dur": 2.604, | |
"args": { | |
"External id": 165, "cbid": 307, "correlation": 867 | |
} | |
}, | |
{ | |
"ph": "s", "id": 867, "pid": 2537909, "tid": 2537909, "ts": 6157602140672.212, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_5", "pid": 0, "tid": 7, | |
"ts": 6157602203532.304, "dur": 1.792, | |
"args": { | |
"External id": 166, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 874, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.750000, "warps per SM": 3.000000, "grid": [99, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 5 | |
} | |
}, | |
{ | |
"ph": "f", "id": 874, "pid": 0, "tid": 7, "ts": 6157602203532.304, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602140685.292, "dur": 3.205, | |
"args": { | |
"External id": 166, "cbid": 307, "correlation": 874 | |
} | |
}, | |
{ | |
"ph": "s", "id": 874, "pid": 2537909, "tid": 2537909, "ts": 6157602140685.292, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_addmm_mm_nll_loss_forward_6", "pid": 0, "tid": 7, | |
"ts": 6157602203535.408, "dur": 108.128, | |
"args": { | |
"External id": 167, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 889, "registers per thread": 16, "shared memory": 0, "blocks per SM": 285.590912, "warps per SM": 1142.363647, "grid": [37698, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 | |
} | |
}, | |
{ | |
"ph": "f", "id": 889, "pid": 0, "tid": 7, "ts": 6157602203535.408, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602140718.222, "dur": 3.034, | |
"args": { | |
"External id": 167, "cbid": 307, "correlation": 889 | |
} | |
}, | |
{ | |
"ph": "s", "id": 889, "pid": 2537909, "tid": 2537909, "ts": 6157602140718.222, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, | |
"ts": 6157602203645.104, "dur": 0.896, | |
"args": { | |
"External id": 168, "device": 0, "context": 1, "stream": 7, "correlation": 904, "bytes": 4, "memory bandwidth (GB/s)": 0.004464285714285714 | |
} | |
}, | |
{ | |
"ph": "f", "id": 904, "pid": 0, "tid": 7, "ts": 6157602203645.104, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602140750.891, "dur": 4.887, | |
"args": { | |
"External id": 168, "cbid": 51, "correlation": 904 | |
} | |
}, | |
{ | |
"ph": "s", "id": 904, "pid": 2537909, "tid": 2537909, "ts": 6157602140750.891, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "nvjet_tst_208x192_64x4_2x1_v_bz_coopB_TNN", "pid": 0, "tid": 7, | |
"ts": 6157602203647.664, "dur": 4371.940, | |
"args": { | |
"External id": 168, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 905, "registers per thread": 168, "shared memory": 221340, "blocks per SM": 1.000000, "warps per SM": 12.000000, "grid": [2, 66, 1], "block": [384, 1, 1], "est. achieved occupancy %": 0 | |
} | |
}, | |
{ | |
"ph": "f", "id": 905, "pid": 0, "tid": 7, "ts": 6157602203647.664, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernelEx", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602140755.999, "dur": 4.717, | |
"args": { | |
"External id": 168, "cbid": 652, "correlation": 905 | |
} | |
}, | |
{ | |
"ph": "s", "id": 905, "pid": 2537909, "tid": 2537909, "ts": 6157602140755.999, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_red_fused__log_softmax__log_softmax_backward_data_nll_loss_backward_nll_loss_forward_7", "pid": 0, "tid": 7, | |
"ts": 6157602208020.980, "dur": 7215.591, | |
"args": { | |
"External id": 170, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 926, "registers per thread": 48, "shared memory": 32, "blocks per SM": 248.242432, "warps per SM": 1985.939453, "grid": [32768, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 63 | |
} | |
}, | |
{ | |
"ph": "f", "id": 926, "pid": 0, "tid": 7, "ts": 6157602208020.980, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602140792.363, "dur": 4.337, | |
"args": { | |
"External id": 170, "cbid": 307, "correlation": 926 | |
} | |
}, | |
{ | |
"ph": "s", "id": 926, "pid": 2537909, "tid": 2537909, "ts": 6157602140792.363, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_poi_fused__log_softmax_backward_data_mm_nll_loss_forward_8", "pid": 0, "tid": 7, | |
"ts": 6157602215237.947, "dur": 2.848, | |
"args": { | |
"External id": 171, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 929, "registers per thread": 16, "shared memory": 0, "blocks per SM": 3.393939, "warps per SM": 27.151516, "grid": [448, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 42 | |
} | |
}, | |
{ | |
"ph": "f", "id": 929, "pid": 0, "tid": 7, "ts": 6157602215237.947, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602140807.446, "dur": 2.804, | |
"args": { | |
"External id": 171, "cbid": 307, "correlation": 929 | |
} | |
}, | |
{ | |
"ph": "s", "id": 929, "pid": 2537909, "tid": 2537909, "ts": 6157602140807.446, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, | |
"ts": 6157602215242.235, "dur": 1.184, | |
"args": { | |
"External id": 172, "device": 0, "context": 1, "stream": 7, "correlation": 944, "bytes": 4, "memory bandwidth (GB/s)": 0.0033783783783783786 | |
} | |
}, | |
{ | |
"ph": "f", "id": 944, "pid": 0, "tid": 7, "ts": 6157602215242.235, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602140828.949, "dur": 3.104, | |
"args": { | |
"External id": 172, "cbid": 51, "correlation": 944 | |
} | |
}, | |
{ | |
"ph": "s", "id": 944, "pid": 2537909, "tid": 2537909, "ts": 6157602140828.949, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "nvjet_tst_256x128_64x4_1x2_h_bz_coopA_NNT", "pid": 0, "tid": 7, | |
"ts": 6157602215245.531, "dur": 3707.620, | |
"args": { | |
"External id": 172, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 945, "registers per thread": 168, "shared memory": 213148, "blocks per SM": 1.000000, "warps per SM": 12.000000, "grid": [2, 66, 1], "block": [384, 1, 1], "est. achieved occupancy %": 0 | |
} | |
}, | |
{ | |
"ph": "f", "id": 945, "pid": 0, "tid": 7, "ts": 6157602215245.531, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernelEx", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602140832.214, "dur": 3.595, | |
"args": { | |
"External id": 172, "cbid": 652, "correlation": 945 | |
} | |
}, | |
{ | |
"ph": "s", "id": 945, "pid": 2537909, "tid": 2537909, "ts": 6157602140832.214, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_red_fused_nll_loss_forward_sum_9", "pid": 0, "tid": 7, | |
"ts": 6157602218954.526, "dur": 1679.490, | |
"args": { | |
"External id": 173, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 956, "registers per thread": 40, "shared memory": 4096, "blocks per SM": 5.954545, "warps per SM": 95.272728, "grid": [786, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 75 | |
} | |
}, | |
{ | |
"ph": "f", "id": 956, "pid": 0, "tid": 7, "ts": 6157602218954.526, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602140853.225, "dur": 3.015, | |
"args": { | |
"External id": 173, "cbid": 307, "correlation": 956 | |
} | |
}, | |
{ | |
"ph": "s", "id": 956, "pid": 2537909, "tid": 2537909, "ts": 6157602140853.225, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, | |
"ts": 6157602220635.200, "dur": 77.344, | |
"args": { | |
"External id": 174, "device": 0, "context": 1, "stream": 7, "correlation": 963, "bytes": 77194752, "memory bandwidth (GB/s)": 998.0703351261895 | |
} | |
}, | |
{ | |
"ph": "f", "id": 963, "pid": 0, "tid": 7, "ts": 6157602220635.200, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602140869.620, "dur": 11.187, | |
"args": { | |
"External id": 174, "cbid": 41, "correlation": 963 | |
} | |
}, | |
{ | |
"ph": "s", "id": 963, "pid": 2537909, "tid": 2537909, "ts": 6157602140869.620, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602140890.501, "dur": 0.461, | |
"args": { | |
"External id": 174, "cbid": 200, "correlation": 974 | |
} | |
}, | |
{ | |
"ph": "f", "id": 974, "pid": 2537909, "tid": 2537909, "ts": 6157602140890.501, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2<cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_256x128_32x6_nt_align8>(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_256x128_32x6_nt_align8::Params)", "pid": 0, "tid": 7, | |
"ts": 6157602220713.632, "dur": 6128.294, | |
"args": { | |
"External id": 174, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 977, "registers per thread": 216, "shared memory": 147456, "blocks per SM": 11.909091, "warps per SM": 95.272728, "grid": [1572, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 | |
} | |
}, | |
{ | |
"ph": "f", "id": 977, "pid": 0, "tid": 7, "ts": 6157602220713.632, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602140892.184, "dur": 3.335, | |
"args": { | |
"External id": 174, "cbid": 307, "correlation": 977 | |
} | |
}, | |
{ | |
"ph": "s", "id": 977, "pid": 2537909, "tid": 2537909, "ts": 6157602140892.184, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_red_fused_nll_loss_forward_10", "pid": 0, "tid": 7, | |
"ts": 6157602226843.302, "dur": 2.880, | |
"args": { | |
"External id": 175, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 989, "registers per thread": 26, "shared memory": 64, "blocks per SM": 0.030303, "warps per SM": 0.484848, "grid": [4, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 1 | |
} | |
}, | |
{ | |
"ph": "f", "id": 989, "pid": 0, "tid": 7, "ts": 6157602226843.302, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602140921.148, "dur": 3.916, | |
"args": { | |
"External id": 175, "cbid": 307, "correlation": 989 | |
} | |
}, | |
{ | |
"ph": "s", "id": 989, "pid": 2537909, "tid": 2537909, "ts": 6157602140921.148, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_per_fused_nll_loss_forward_11", "pid": 0, "tid": 7, | |
"ts": 6157602226847.270, "dur": 2.304, | |
"args": { | |
"External id": 176, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 994, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.015152, "grid": [1, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0 | |
} | |
}, | |
{ | |
"ph": "f", "id": 994, "pid": 0, "tid": 7, "ts": 6157602226847.270, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602140936.631, "dur": 2.874, | |
"args": { | |
"External id": 176, "cbid": 307, "correlation": 994 | |
} | |
}, | |
{ | |
"ph": "s", "id": 994, "pid": 2537909, "tid": 2537909, "ts": 6157602140936.631, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<8, at::native::FillFunctor<c10::BFloat16>, std::array<char*, 1ul> >(int, at::native::FillFunctor<c10::BFloat16>, std::array<char*, 1ul>)", "pid": 0, "tid": 7, | |
"ts": 6157602226851.686, "dur": 1.856, | |
"args": { | |
"External id": 180, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1005, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.030303, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 | |
} | |
}, | |
{ | |
"ph": "f", "id": 1005, "pid": 0, "tid": 7, "ts": 6157602226851.686, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602141006.136, "dur": 6.179, | |
"args": { | |
"External id": 180, "cbid": 211, "correlation": 1005 | |
} | |
}, | |
{ | |
"ph": "s", "id": 1005, "pid": 2537909, "tid": 2537909, "ts": 6157602141006.136, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602141028.950, "dur": 1.112, | |
"args": { | |
"External id": 138, "cbid": 135, "correlation": 1013 | |
} | |
}, | |
{ | |
"ph": "f", "id": 1013, "pid": 2537909, "tid": 2537909, "ts": 6157602141028.950, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_0", "pid": 0, "tid": 7, | |
"ts": 6157602226854.886, "dur": 61.344, | |
"args": { | |
"External id": 566, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1022, "registers per thread": 16, "shared memory": 0, "blocks per SM": 285.553040, "warps per SM": 1142.212158, "grid": [37693, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 | |
} | |
}, | |
{ | |
"ph": "f", "id": 1022, "pid": 0, "tid": 7, "ts": 6157602226854.886, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602141147.840, "dur": 8.873, | |
"args": { | |
"External id": 566, "cbid": 307, "correlation": 1022 | |
} | |
}, | |
{ | |
"ph": "s", "id": 1022, "pid": 2537909, "tid": 2544200, "ts": 6157602141147.840, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_1", "pid": 0, "tid": 7, | |
"ts": 6157602226917.318, "dur": 2.560, | |
"args": { | |
"External id": 567, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1026, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.750000, "warps per SM": 3.000000, "grid": [99, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 5 | |
} | |
}, | |
{ | |
"ph": "f", "id": 1026, "pid": 0, "tid": 7, "ts": 6157602226917.318, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602141167.900, "dur": 2.914, | |
"args": { | |
"External id": 567, "cbid": 307, "correlation": 1026 | |
} | |
}, | |
{ | |
"ph": "s", "id": 1026, "pid": 2537909, "tid": 2544200, "ts": 6157602141167.900, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_mul_2", "pid": 0, "tid": 7, | |
"ts": 6157602226921.286, "dur": 47.936, | |
"args": { | |
"External id": 568, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1030, "registers per thread": 22, "shared memory": 0, "blocks per SM": 186.181824, "warps per SM": 744.727295, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 | |
} | |
}, | |
{ | |
"ph": "f", "id": 1030, "pid": 0, "tid": 7, "ts": 6157602226921.286, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602141184.275, "dur": 2.664, | |
"args": { | |
"External id": 568, "cbid": 307, "correlation": 1030 | |
} | |
}, | |
{ | |
"ph": "s", "id": 1030, "pid": 2537909, "tid": 2544200, "ts": 6157602141184.275, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602141205.527, "dur": 1.121, | |
"args": { | |
"External id": 564, "cbid": 135, "correlation": 1035 | |
} | |
}, | |
{ | |
"ph": "f", "id": 1035, "pid": 2537909, "tid": 2544200, "ts": 6157602141205.527, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602141208.832, "dur": 0.390, | |
"args": { | |
"External id": 564, "cbid": 135, "correlation": 1040 | |
} | |
}, | |
{ | |
"ph": "f", "id": 1040, "pid": 2537909, "tid": 2544200, "ts": 6157602141208.832, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602141211.566, "dur": 0.330, | |
"args": { | |
"External id": 564, "cbid": 135, "correlation": 1045 | |
} | |
}, | |
{ | |
"ph": "f", "id": 1045, "pid": 2537909, "tid": 2544200, "ts": 6157602141211.566, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_red_fused_nll_loss_backward_nll_loss_forward_0", "pid": 0, "tid": 7, | |
"ts": 6157602226971.430, "dur": 9.376, | |
"args": { | |
"External id": 186, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1106, "registers per thread": 32, "shared memory": 16384, "blocks per SM": 0.030303, "warps per SM": 0.484848, "grid": [4, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 1 | |
} | |
}, | |
{ | |
"ph": "f", "id": 1106, "pid": 0, "tid": 7, "ts": 6157602226971.430, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602141454.782, "dur": 11.698, | |
"args": { | |
"External id": 186, "cbid": 307, "correlation": 1106 | |
} | |
}, | |
{ | |
"ph": "s", "id": 1106, "pid": 2537909, "tid": 2537909, "ts": 6157602141454.782, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_per_fused_nll_loss_forward_1", "pid": 0, "tid": 7, | |
"ts": 6157602226982.182, "dur": 1.728, | |
"args": { | |
"External id": 187, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1114, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.015152, "grid": [1, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0 | |
} | |
}, | |
{ | |
"ph": "f", "id": 1114, "pid": 0, "tid": 7, "ts": 6157602226982.182, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602141481.162, "dur": 3.866, | |
"args": { | |
"External id": 187, "cbid": 307, "correlation": 1114 | |
} | |
}, | |
{ | |
"ph": "s", "id": 1114, "pid": 2537909, "tid": 2537909, "ts": 6157602141481.162, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_mul_2", "pid": 0, "tid": 7, | |
"ts": 6157602226985.254, "dur": 48.608, | |
"args": { | |
"External id": 188, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1122, "registers per thread": 16, "shared memory": 0, "blocks per SM": 186.181824, "warps per SM": 744.727295, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 | |
} | |
}, | |
{ | |
"ph": "f", "id": 1122, "pid": 0, "tid": 7, "ts": 6157602226985.254, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602141498.779, "dur": 3.014, | |
"args": { | |
"External id": 188, "cbid": 307, "correlation": 1122 | |
} | |
}, | |
{ | |
"ph": "s", "id": 1122, "pid": 2537909, "tid": 2537909, "ts": 6157602141498.779, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_3", "pid": 0, "tid": 7, | |
"ts": 6157602227035.206, "dur": 1.920, | |
"args": { | |
"External id": 209, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1129, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.007576, "grid": [1, 1, 1], "block": [32, 1, 1], "est. achieved occupancy %": 0 | |
} | |
}, | |
{ | |
"ph": "f", "id": 1129, "pid": 0, "tid": 7, "ts": 6157602227035.206, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602141557.127, "dur": 4.747, | |
"args": { | |
"External id": 209, "cbid": 307, "correlation": 1129 | |
} | |
}, | |
{ | |
"ph": "s", "id": 1129, "pid": 2537909, "tid": 2537909, "ts": 6157602141557.127, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_4", "pid": 0, "tid": 7, | |
"ts": 6157602227039.238, "dur": 32.096, | |
"args": { | |
"External id": 210, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1136, "registers per thread": 16, "shared memory": 0, "blocks per SM": 285.553040, "warps per SM": 1142.212158, "grid": [37693, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 | |
} | |
}, | |
{ | |
"ph": "f", "id": 1136, "pid": 0, "tid": 7, "ts": 6157602227039.238, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602141573.461, "dur": 3.756, | |
"args": { | |
"External id": 210, "cbid": 307, "correlation": 1136 | |
} | |
}, | |
{ | |
"ph": "s", "id": 1136, "pid": 2537909, "tid": 2537909, "ts": 6157602141573.461, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_5", "pid": 0, "tid": 7, | |
"ts": 6157602227073.606, "dur": 1.760, | |
"args": { | |
"External id": 211, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1143, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.750000, "warps per SM": 3.000000, "grid": [99, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 5 | |
} | |
}, | |
{ | |
"ph": "f", "id": 1143, "pid": 0, "tid": 7, "ts": 6157602227073.606, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602141586.631, "dur": 2.604, | |
"args": { | |
"External id": 211, "cbid": 307, "correlation": 1143 | |
} | |
}, | |
{ | |
"ph": "s", "id": 1143, "pid": 2537909, "tid": 2537909, "ts": 6157602141586.631, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_addmm_mm_nll_loss_forward_6", "pid": 0, "tid": 7, | |
"ts": 6157602227076.390, "dur": 108.960, | |
"args": { | |
"External id": 212, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1158, "registers per thread": 16, "shared memory": 0, "blocks per SM": 285.590912, "warps per SM": 1142.363647, "grid": [37698, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 | |
} | |
}, | |
{ | |
"ph": "f", "id": 1158, "pid": 0, "tid": 7, "ts": 6157602227076.390, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602141619.090, "dur": 3.355, | |
"args": { | |
"External id": 212, "cbid": 307, "correlation": 1158 | |
} | |
}, | |
{ | |
"ph": "s", "id": 1158, "pid": 2537909, "tid": 2537909, "ts": 6157602141619.090, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, | |
"ts": 6157602227187.270, "dur": 1.120, | |
"args": { | |
"External id": 213, "device": 0, "context": 1, "stream": 7, "correlation": 1173, "bytes": 4, "memory bandwidth (GB/s)": 0.0035714285714285713 | |
} | |
}, | |
{ | |
"ph": "f", "id": 1173, "pid": 0, "tid": 7, "ts": 6157602227187.270, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602141649.426, "dur": 4.577, | |
"args": { | |
"External id": 213, "cbid": 51, "correlation": 1173 | |
} | |
}, | |
{ | |
"ph": "s", "id": 1173, "pid": 2537909, "tid": 2537909, "ts": 6157602141649.426, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "nvjet_tst_208x192_64x4_2x1_v_bz_coopB_TNN", "pid": 0, "tid": 7, | |
"ts": 6157602227190.086, "dur": 4370.341, | |
"args": { | |
"External id": 213, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1174, "registers per thread": 168, "shared memory": 221340, "blocks per SM": 1.000000, "warps per SM": 12.000000, "grid": [2, 66, 1], "block": [384, 1, 1], "est. achieved occupancy %": 0 | |
} | |
}, | |
{ | |
"ph": "f", "id": 1174, "pid": 0, "tid": 7, "ts": 6157602227190.086, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernelEx", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602141654.243, "dur": 4.607, | |
"args": { | |
"External id": 213, "cbid": 652, "correlation": 1174 | |
} | |
}, | |
{ | |
"ph": "s", "id": 1174, "pid": 2537909, "tid": 2537909, "ts": 6157602141654.243, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_red_fused__log_softmax__log_softmax_backward_data_nll_loss_backward_nll_loss_forward_7", "pid": 0, "tid": 7, | |
"ts": 6157602231562.763, "dur": 7216.518, | |
"args": { | |
"External id": 215, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1195, "registers per thread": 48, "shared memory": 32, "blocks per SM": 248.242432, "warps per SM": 1985.939453, "grid": [32768, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 63 | |
} | |
}, | |
{ | |
"ph": "f", "id": 1195, "pid": 0, "tid": 7, "ts": 6157602231562.763, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602141691.439, "dur": 4.998, | |
"args": { | |
"External id": 215, "cbid": 307, "correlation": 1195 | |
} | |
}, | |
{ | |
"ph": "s", "id": 1195, "pid": 2537909, "tid": 2537909, "ts": 6157602141691.439, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_poi_fused__log_softmax_backward_data_mm_nll_loss_forward_8", "pid": 0, "tid": 7, | |
"ts": 6157602238781.425, "dur": 2.880, | |
"args": { | |
"External id": 216, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1198, "registers per thread": 16, "shared memory": 0, "blocks per SM": 3.393939, "warps per SM": 27.151516, "grid": [448, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 42 | |
} | |
}, | |
{ | |
"ph": "f", "id": 1198, "pid": 0, "tid": 7, "ts": 6157602238781.425, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602141707.954, "dur": 3.035, | |
"args": { | |
"External id": 216, "cbid": 307, "correlation": 1198 | |
} | |
}, | |
{ | |
"ph": "s", "id": 1198, "pid": 2537909, "tid": 2537909, "ts": 6157602141707.954, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, | |
"ts": 6157602238786.161, "dur": 0.896, | |
"args": { | |
"External id": 217, "device": 0, "context": 1, "stream": 7, "correlation": 1213, "bytes": 4, "memory bandwidth (GB/s)": 0.004464285714285714 | |
} | |
}, | |
{ | |
"ph": "f", "id": 1213, "pid": 0, "tid": 7, "ts": 6157602238786.161, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602141729.467, "dur": 2.964, | |
"args": { | |
"External id": 217, "cbid": 51, "correlation": 1213 | |
} | |
}, | |
{ | |
"ph": "s", "id": 1213, "pid": 2537909, "tid": 2537909, "ts": 6157602141729.467, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "nvjet_tst_256x128_64x4_1x2_h_bz_coopA_NNT", "pid": 0, "tid": 7, | |
"ts": 6157602238789.233, "dur": 3724.740, | |
"args": { | |
"External id": 217, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1214, "registers per thread": 168, "shared memory": 213148, "blocks per SM": 1.000000, "warps per SM": 12.000000, "grid": [2, 66, 1], "block": [384, 1, 1], "est. achieved occupancy %": 0 | |
} | |
}, | |
{ | |
"ph": "f", "id": 1214, "pid": 0, "tid": 7, "ts": 6157602238789.233, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernelEx", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602141732.581, "dur": 3.876, | |
"args": { | |
"External id": 217, "cbid": 652, "correlation": 1214 | |
} | |
}, | |
{ | |
"ph": "s", "id": 1214, "pid": 2537909, "tid": 2537909, "ts": 6157602141732.581, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_red_fused_nll_loss_forward_sum_9", "pid": 0, "tid": 7, | |
"ts": 6157602242514.997, "dur": 1677.441, | |
"args": { | |
"External id": 218, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1225, "registers per thread": 40, "shared memory": 4096, "blocks per SM": 5.954545, "warps per SM": 95.272728, "grid": [786, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 75 | |
} | |
}, | |
{ | |
"ph": "f", "id": 1225, "pid": 0, "tid": 7, "ts": 6157602242514.997, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602141753.703, "dur": 3.075, | |
"args": { | |
"External id": 218, "cbid": 307, "correlation": 1225 | |
} | |
}, | |
{ | |
"ph": "s", "id": 1225, "pid": 2537909, "tid": 2537909, "ts": 6157602141753.703, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, | |
"ts": 6157602244194.678, "dur": 77.313, | |
"args": { | |
"External id": 219, "device": 0, "context": 1, "stream": 7, "correlation": 1232, "bytes": 77194752, "memory bandwidth (GB/s)": 998.4705288890614 | |
} | |
}, | |
{ | |
"ph": "f", "id": 1232, "pid": 0, "tid": 7, "ts": 6157602244194.678, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602141769.998, "dur": 10.736, | |
"args": { | |
"External id": 219, "cbid": 41, "correlation": 1232 | |
} | |
}, | |
{ | |
"ph": "s", "id": 1232, "pid": 2537909, "tid": 2537909, "ts": 6157602141769.998, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602141790.719, "dur": 0.461, | |
"args": { | |
"External id": 219, "cbid": 200, "correlation": 1243 | |
} | |
}, | |
{ | |
"ph": "f", "id": 1243, "pid": 2537909, "tid": 2537909, "ts": 6157602141790.719, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2<cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_256x128_32x6_nt_align8>(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_256x128_32x6_nt_align8::Params)", "pid": 0, "tid": 7, | |
"ts": 6157602244273.143, "dur": 6101.189, | |
"args": { | |
"External id": 219, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1246, "registers per thread": 216, "shared memory": 147456, "blocks per SM": 11.909091, "warps per SM": 95.272728, "grid": [1572, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0 | |
} | |
}, | |
{ | |
"ph": "f", "id": 1246, "pid": 0, "tid": 7, "ts": 6157602244273.143, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602141792.382, "dur": 3.054, | |
"args": { | |
"External id": 219, "cbid": 307, "correlation": 1246 | |
} | |
}, | |
{ | |
"ph": "s", "id": 1246, "pid": 2537909, "tid": 2537909, "ts": 6157602141792.382, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_red_fused_nll_loss_forward_10", "pid": 0, "tid": 7, | |
"ts": 6157602250375.484, "dur": 3.136, | |
"args": { | |
"External id": 220, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1258, "registers per thread": 26, "shared memory": 64, "blocks per SM": 0.030303, "warps per SM": 0.484848, "grid": [4, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 1 | |
} | |
}, | |
{ | |
"ph": "f", "id": 1258, "pid": 0, "tid": 7, "ts": 6157602250375.484, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602141822.637, "dur": 3.435, | |
"args": { | |
"External id": 220, "cbid": 307, "correlation": 1258 | |
} | |
}, | |
{ | |
"ph": "s", "id": 1258, "pid": 2537909, "tid": 2537909, "ts": 6157602141822.637, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_per_fused_nll_loss_forward_11", "pid": 0, "tid": 7, | |
"ts": 6157602250380.860, "dur": 2.144, | |
"args": { | |
"External id": 221, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1263, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.015152, "grid": [1, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0 | |
} | |
}, | |
{ | |
"ph": "f", "id": 1263, "pid": 0, "tid": 7, "ts": 6157602250380.860, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602141838.151, "dur": 2.794, | |
"args": { | |
"External id": 221, "cbid": 307, "correlation": 1263 | |
} | |
}, | |
{ | |
"ph": "s", "id": 1263, "pid": 2537909, "tid": 2537909, "ts": 6157602141838.151, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<8, at::native::FillFunctor<c10::BFloat16>, std::array<char*, 1ul> >(int, at::native::FillFunctor<c10::BFloat16>, std::array<char*, 1ul>)", "pid": 0, "tid": 7, | |
"ts": 6157602250384.188, "dur": 1.632, | |
"args": { | |
"External id": 225, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1274, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.030303, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 | |
} | |
}, | |
{ | |
"ph": "f", "id": 1274, "pid": 0, "tid": 7, "ts": 6157602250384.188, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602141910.049, "dur": 5.619, | |
"args": { | |
"External id": 225, "cbid": 211, "correlation": 1274 | |
} | |
}, | |
{ | |
"ph": "s", "id": 1274, "pid": 2537909, "tid": 2537909, "ts": 6157602141910.049, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602141932.203, "dur": 1.081, | |
"args": { | |
"External id": 183, "cbid": 135, "correlation": 1282 | |
} | |
}, | |
{ | |
"ph": "f", "id": 1282, "pid": 2537909, "tid": 2537909, "ts": 6157602141932.203, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_0", "pid": 0, "tid": 7, | |
"ts": 6157602250387.164, "dur": 61.504, | |
"args": { | |
"External id": 583, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1291, "registers per thread": 16, "shared memory": 0, "blocks per SM": 285.553040, "warps per SM": 1142.212158, "grid": [37693, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 | |
} | |
}, | |
{ | |
"ph": "f", "id": 1291, "pid": 0, "tid": 7, "ts": 6157602250387.164, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602142051.733, "dur": 8.933, | |
"args": { | |
"External id": 583, "cbid": 307, "correlation": 1291 | |
} | |
}, | |
{ | |
"ph": "s", "id": 1291, "pid": 2537909, "tid": 2544200, "ts": 6157602142051.733, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_1", "pid": 0, "tid": 7, | |
"ts": 6157602250449.724, "dur": 2.560, | |
"args": { | |
"External id": 584, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1295, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.750000, "warps per SM": 3.000000, "grid": [99, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 5 | |
} | |
}, | |
{ | |
"ph": "f", "id": 1295, "pid": 0, "tid": 7, "ts": 6157602250449.724, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602142071.613, "dur": 3.064, | |
"args": { | |
"External id": 584, "cbid": 307, "correlation": 1295 | |
} | |
}, | |
{ | |
"ph": "s", "id": 1295, "pid": 2537909, "tid": 2544200, "ts": 6157602142071.613, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_mul_2", "pid": 0, "tid": 7, | |
"ts": 6157602250453.340, "dur": 48.449, | |
"args": { | |
"External id": 585, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1299, "registers per thread": 22, "shared memory": 0, "blocks per SM": 186.181824, "warps per SM": 744.727295, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 | |
} | |
}, | |
{ | |
"ph": "f", "id": 1299, "pid": 0, "tid": 7, "ts": 6157602250453.340, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602142087.627, "dur": 2.864, | |
"args": { | |
"External id": 585, "cbid": 307, "correlation": 1299 | |
} | |
}, | |
{ | |
"ph": "s", "id": 1299, "pid": 2537909, "tid": 2544200, "ts": 6157602142087.627, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602142107.988, "dur": 0.841, | |
"args": { | |
"External id": 581, "cbid": 135, "correlation": 1304 | |
} | |
}, | |
{ | |
"ph": "f", "id": 1304, "pid": 2537909, "tid": 2544200, "ts": 6157602142107.988, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602142111.503, "dur": 0.551, | |
"args": { | |
"External id": 581, "cbid": 135, "correlation": 1309 | |
} | |
}, | |
{ | |
"ph": "f", "id": 1309, "pid": 2537909, "tid": 2544200, "ts": 6157602142111.503, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 2537909, "tid": 2544200, | |
"ts": 6157602142114.377, "dur": 0.341, | |
"args": { | |
"External id": 581, "cbid": 135, "correlation": 1314 | |
} | |
}, | |
{ | |
"ph": "f", "id": 1314, "pid": 2537909, "tid": 2544200, "ts": 6157602142114.377, | |
"cat": "ac2g", "name": "ac2g", "bp": "e" | |
}, | |
{ | |
"ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceSynchronize", "pid": 2537909, "tid": 2537909, | |
"ts": 6157602142265.716, "dur": 108250.600, | |
"args": { | |
"cbid": 165, "correlation": 1348 | |
} | |
}, | |
{ | |
"ph": "s", "id": 1348, "pid": 2537909, "tid": 2537909, "ts": 6157602142265.716, | |
"cat": "ac2g", "name": "ac2g" | |
}, | |
{ | |
"ph": "X", "cat": "gpu_user_annotation", "name": "Step 4", "pid": 0, "tid": 7, | |
"ts": 6157602226971.429, "dur": 23414.392, | |
"args": { | |
"External id": 181 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "gpu_user_annotation", "name": "Step 3", "pid": 0, "tid": 7, | |
"ts": 6157602203431.087, "dur": 23422.456, | |
"args": { | |
"External id": 136 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "gpu_user_annotation", "name": "Step 2", "pid": 0, "tid": 7, | |
"ts": 6157602180249.849, "dur": 23063.224, | |
"args": { | |
"External id": 91 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "gpu_user_annotation", "name": "Step 1", "pid": 0, "tid": 7, | |
"ts": 6157602158480.708, "dur": 21652.375, | |
"args": { | |
"External id": 46 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "gpu_user_annotation", "name": "Step 0", "pid": 0, "tid": 7, | |
"ts": 6157602136504.304, "dur": 21858.838, | |
"args": { | |
"External id": 1 | |
} | |
}, | |
{ | |
"name": "process_name", "ph": "M", "ts": 6157602134683.368, "pid": 2537909, "tid": 0, | |
"args": { | |
"name": "python" | |
} | |
}, | |
{ | |
"name": "process_labels", "ph": "M", "ts": 6157602134683.368, "pid": 2537909, "tid": 0, | |
"args": { | |
"labels": "CPU" | |
} | |
}, | |
{ | |
"name": "process_sort_index", "ph": "M", "ts": 6157602134683.368, "pid": 2537909, "tid": 0, | |
"args": { | |
"sort_index": 2537909 | |
} | |
}, | |
{ | |
"name": "process_name", "ph": "M", "ts": 6157602134683.368, "pid": 0, "tid": 0, | |
"args": { | |
"name": "python" | |
} | |
}, | |
{ | |
"name": "process_labels", "ph": "M", "ts": 6157602134683.368, "pid": 0, "tid": 0, | |
"args": { | |
"labels": "GPU 0" | |
} | |
}, | |
{ | |
"name": "process_sort_index", "ph": "M", "ts": 6157602134683.368, "pid": 0, "tid": 0, | |
"args": { | |
"sort_index": 5000000 | |
} | |
}, | |
{ | |
"name": "process_name", "ph": "M", "ts": 6157602134683.368, "pid": 1, "tid": 0, | |
"args": { | |
"name": "python" | |
} | |
}, | |
{ | |
"name": "process_labels", "ph": "M", "ts": 6157602134683.368, "pid": 1, "tid": 0, | |
"args": { | |
"labels": "GPU 1" | |
} | |
}, | |
{ | |
"name": "process_sort_index", "ph": "M", "ts": 6157602134683.368, "pid": 1, "tid": 0, | |
"args": { | |
"sort_index": 5000001 | |
} | |
}, | |
{ | |
"name": "process_name", "ph": "M", "ts": 6157602134683.368, "pid": 2, "tid": 0, | |
"args": { | |
"name": "python" | |
} | |
}, | |
{ | |
"name": "process_labels", "ph": "M", "ts": 6157602134683.368, "pid": 2, "tid": 0, | |
"args": { | |
"labels": "GPU 2" | |
} | |
}, | |
{ | |
"name": "process_sort_index", "ph": "M", "ts": 6157602134683.368, "pid": 2, "tid": 0, | |
"args": { | |
"sort_index": 5000002 | |
} | |
}, | |
{ | |
"name": "process_name", "ph": "M", "ts": 6157602134683.368, "pid": 3, "tid": 0, | |
"args": { | |
"name": "python" | |
} | |
}, | |
{ | |
"name": "process_labels", "ph": "M", "ts": 6157602134683.368, "pid": 3, "tid": 0, | |
"args": { | |
"labels": "GPU 3" | |
} | |
}, | |
{ | |
"name": "process_sort_index", "ph": "M", "ts": 6157602134683.368, "pid": 3, "tid": 0, | |
"args": { | |
"sort_index": 5000003 | |
} | |
}, | |
{ | |
"name": "process_name", "ph": "M", "ts": 6157602134683.368, "pid": 4, "tid": 0, | |
"args": { | |
"name": "python" | |
} | |
}, | |
{ | |
"name": "process_labels", "ph": "M", "ts": 6157602134683.368, "pid": 4, "tid": 0, | |
"args": { | |
"labels": "GPU 4" | |
} | |
}, | |
{ | |
"name": "process_sort_index", "ph": "M", "ts": 6157602134683.368, "pid": 4, "tid": 0, | |
"args": { | |
"sort_index": 5000004 | |
} | |
}, | |
{ | |
"name": "process_name", "ph": "M", "ts": 6157602134683.368, "pid": 5, "tid": 0, | |
"args": { | |
"name": "python" | |
} | |
}, | |
{ | |
"name": "process_labels", "ph": "M", "ts": 6157602134683.368, "pid": 5, "tid": 0, | |
"args": { | |
"labels": "GPU 5" | |
} | |
}, | |
{ | |
"name": "process_sort_index", "ph": "M", "ts": 6157602134683.368, "pid": 5, "tid": 0, | |
"args": { | |
"sort_index": 5000005 | |
} | |
}, | |
{ | |
"name": "process_name", "ph": "M", "ts": 6157602134683.368, "pid": 6, "tid": 0, | |
"args": { | |
"name": "python" | |
} | |
}, | |
{ | |
"name": "process_labels", "ph": "M", "ts": 6157602134683.368, "pid": 6, "tid": 0, | |
"args": { | |
"labels": "GPU 6" | |
} | |
}, | |
{ | |
"name": "process_sort_index", "ph": "M", "ts": 6157602134683.368, "pid": 6, "tid": 0, | |
"args": { | |
"sort_index": 5000006 | |
} | |
}, | |
{ | |
"name": "process_name", "ph": "M", "ts": 6157602134683.368, "pid": 7, "tid": 0, | |
"args": { | |
"name": "python" | |
} | |
}, | |
{ | |
"name": "process_labels", "ph": "M", "ts": 6157602134683.368, "pid": 7, "tid": 0, | |
"args": { | |
"labels": "GPU 7" | |
} | |
}, | |
{ | |
"name": "process_sort_index", "ph": "M", "ts": 6157602134683.368, "pid": 7, "tid": 0, | |
"args": { | |
"sort_index": 5000007 | |
} | |
}, | |
{ | |
"name": "process_name", "ph": "M", "ts": 6157602134683.368, "pid": 8, "tid": 0, | |
"args": { | |
"name": "python" | |
} | |
}, | |
{ | |
"name": "process_labels", "ph": "M", "ts": 6157602134683.368, "pid": 8, "tid": 0, | |
"args": { | |
"labels": "GPU 8" | |
} | |
}, | |
{ | |
"name": "process_sort_index", "ph": "M", "ts": 6157602134683.368, "pid": 8, "tid": 0, | |
"args": { | |
"sort_index": 5000008 | |
} | |
}, | |
{ | |
"name": "process_name", "ph": "M", "ts": 6157602134683.368, "pid": 9, "tid": 0, | |
"args": { | |
"name": "python" | |
} | |
}, | |
{ | |
"name": "process_labels", "ph": "M", "ts": 6157602134683.368, "pid": 9, "tid": 0, | |
"args": { | |
"labels": "GPU 9" | |
} | |
}, | |
{ | |
"name": "process_sort_index", "ph": "M", "ts": 6157602134683.368, "pid": 9, "tid": 0, | |
"args": { | |
"sort_index": 5000009 | |
} | |
}, | |
{ | |
"name": "process_name", "ph": "M", "ts": 6157602134683.368, "pid": 10, "tid": 0, | |
"args": { | |
"name": "python" | |
} | |
}, | |
{ | |
"name": "process_labels", "ph": "M", "ts": 6157602134683.368, "pid": 10, "tid": 0, | |
"args": { | |
"labels": "GPU 10" | |
} | |
}, | |
{ | |
"name": "process_sort_index", "ph": "M", "ts": 6157602134683.368, "pid": 10, "tid": 0, | |
"args": { | |
"sort_index": 5000010 | |
} | |
}, | |
{ | |
"name": "process_name", "ph": "M", "ts": 6157602134683.368, "pid": 11, "tid": 0, | |
"args": { | |
"name": "python" | |
} | |
}, | |
{ | |
"name": "process_labels", "ph": "M", "ts": 6157602134683.368, "pid": 11, "tid": 0, | |
"args": { | |
"labels": "GPU 11" | |
} | |
}, | |
{ | |
"name": "process_sort_index", "ph": "M", "ts": 6157602134683.368, "pid": 11, "tid": 0, | |
"args": { | |
"sort_index": 5000011 | |
} | |
}, | |
{ | |
"name": "process_name", "ph": "M", "ts": 6157602134683.368, "pid": 12, "tid": 0, | |
"args": { | |
"name": "python" | |
} | |
}, | |
{ | |
"name": "process_labels", "ph": "M", "ts": 6157602134683.368, "pid": 12, "tid": 0, | |
"args": { | |
"labels": "GPU 12" | |
} | |
}, | |
{ | |
"name": "process_sort_index", "ph": "M", "ts": 6157602134683.368, "pid": 12, "tid": 0, | |
"args": { | |
"sort_index": 5000012 | |
} | |
}, | |
{ | |
"name": "process_name", "ph": "M", "ts": 6157602134683.368, "pid": 13, "tid": 0, | |
"args": { | |
"name": "python" | |
} | |
}, | |
{ | |
"name": "process_labels", "ph": "M", "ts": 6157602134683.368, "pid": 13, "tid": 0, | |
"args": { | |
"labels": "GPU 13" | |
} | |
}, | |
{ | |
"name": "process_sort_index", "ph": "M", "ts": 6157602134683.368, "pid": 13, "tid": 0, | |
"args": { | |
"sort_index": 5000013 | |
} | |
}, | |
{ | |
"name": "process_name", "ph": "M", "ts": 6157602134683.368, "pid": 14, "tid": 0, | |
"args": { | |
"name": "python" | |
} | |
}, | |
{ | |
"name": "process_labels", "ph": "M", "ts": 6157602134683.368, "pid": 14, "tid": 0, | |
"args": { | |
"labels": "GPU 14" | |
} | |
}, | |
{ | |
"name": "process_sort_index", "ph": "M", "ts": 6157602134683.368, "pid": 14, "tid": 0, | |
"args": { | |
"sort_index": 5000014 | |
} | |
}, | |
{ | |
"name": "process_name", "ph": "M", "ts": 6157602134683.368, "pid": 15, "tid": 0, | |
"args": { | |
"name": "python" | |
} | |
}, | |
{ | |
"name": "process_labels", "ph": "M", "ts": 6157602134683.368, "pid": 15, "tid": 0, | |
"args": { | |
"labels": "GPU 15" | |
} | |
}, | |
{ | |
"name": "process_sort_index", "ph": "M", "ts": 6157602134683.368, "pid": 15, "tid": 0, | |
"args": { | |
"sort_index": 5000015 | |
} | |
}, | |
{ | |
"name": "thread_name", "ph": "M", "ts": 6157602134683.368, "pid": 0, "tid": 7, | |
"args": { | |
"name": "stream 7 " | |
} | |
}, | |
{ | |
"name": "thread_sort_index", "ph": "M", "ts": 6157602134683.368, "pid": 0, "tid": 7, | |
"args": { | |
"sort_index": 7 | |
} | |
}, | |
{ | |
"name": "thread_name", "ph": "M", "ts": 6157602134683.368, "pid": 2537909, "tid": 2544200, | |
"args": { | |
"name": "thread 2544200 (pt_autograd_0)" | |
} | |
}, | |
{ | |
"name": "thread_sort_index", "ph": "M", "ts": 6157602134683.368, "pid": 2537909, "tid": 2544200, | |
"args": { | |
"sort_index": 2544200 | |
} | |
}, | |
{ | |
"name": "thread_name", "ph": "M", "ts": 6157602134683.368, "pid": 2537909, "tid": 2544200, | |
"args": { | |
"name": "thread 2544200 (python)" | |
} | |
}, | |
{ | |
"name": "thread_sort_index", "ph": "M", "ts": 6157602134683.368, "pid": 2537909, "tid": 2544200, | |
"args": { | |
"sort_index": 2544200 | |
} | |
}, | |
{ | |
"name": "thread_name", "ph": "M", "ts": 6157602134683.368, "pid": 2537909, "tid": 2537909, | |
"args": { | |
"name": "thread 2537909 (python)" | |
} | |
}, | |
{ | |
"name": "thread_sort_index", "ph": "M", "ts": 6157602134683.368, "pid": 2537909, "tid": 2537909, | |
"args": { | |
"sort_index": 2537909 | |
} | |
}, | |
{ | |
"ph": "X", "cat": "Trace", "ts": 6157602134629.026, "dur": 115917.113, | |
"pid": "Spans", "tid": "PyTorch Profiler", | |
"name": "PyTorch Profiler (0)", | |
"args": { | |
"Op count": 0 | |
} | |
}, | |
{ | |
"name": "process_sort_index", "ph": "M", "ts": 6157602134629.026, | |
"pid": "Spans", "tid": 0, | |
"args": { | |
"sort_index": 536870912 | |
} | |
}, | |
{ | |
"name": "Iteration Start: PyTorch Profiler", "ph": "i", "s": "g", | |
"pid": "Traces", "tid": "Trace PyTorch Profiler", "ts": 6157602134629.026 | |
}, | |
{ | |
"name": "Record Window End", "ph": "i", "s": "g", | |
"pid": "", "tid": "", "ts": 6157602250920.914 | |
} | |
], | |
"traceName": "/tmp/trace.json" | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment