Skip to content

Instantly share code, notes, and snippets.

@shunting314
Created June 11, 2025 00:15
Show Gist options
  • Save shunting314/8bf1bd3256f7a64d7aa7e25caade0628 to your computer and use it in GitHub Desktop.
Save shunting314/8bf1bd3256f7a64d7aa7e25caade0628 to your computer and use it in GitHub Desktop.
{
"schemaVersion": 1,
"deviceProperties": [
{
"id": 0, "name": "NVIDIA H100", "totalGlobalMem": 102010781696,
"computeMajor": 9, "computeMinor": 0,
"maxThreadsPerBlock": 1024, "maxThreadsPerMultiprocessor": 2048,
"regsPerBlock": 65536, "warpSize": 32,
"sharedMemPerBlock": 49152, "numSms": 132
, "regsPerMultiprocessor": 65536, "sharedMemPerBlockOptin": 232448, "sharedMemPerMultiprocessor": 233472
}
],
"cupti_version": 24,
"cuda_runtime_version": 12060,
"cuda_driver_version": 12020,
"trace_id": "3AC401439E8F44E3B20981F5147F02AD",
"displayTimeUnit": "ms",
"baseTimeNanoseconds": 1743521598000000000,
"traceEvents": [
{
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 3416838, "tid": 3420252,
"ts": 6079338127631.521, "dur": 538.702,
"args": {
"External id": 513,"Record function id": 0, "Sequence number": 127, "Fwd thread id": 1, "Ev Idx": 0
}
},
{
"ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 3416838, "tid": 3420252,
"ts": 6079338127645.873, "dur": 484.871,
"args": {
"External id": 514,"Record function id": 0, "Sequence number": 127, "Fwd thread id": 1, "Ev Idx": 1
}
},
{
"ph": "f", "id": 1, "pid": 3416838, "tid": 3420252, "ts": 6079338127645.873,
"cat": "fwdbwd", "name": "fwdbwd", "bp": "e"
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_0", "pid": 3416838, "tid": 3420252,
"ts": 6079338127948.149, "dur": 83.616,
"args": {
"External id": 515,"Record function id": 0, "Ev Idx": 2
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_1", "pid": 3416838, "tid": 3420252,
"ts": 6079338128048.059, "dur": 14.392,
"args": {
"External id": 516,"Record function id": 0, "Ev Idx": 3
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_mul_2", "pid": 3416838, "tid": 3420252,
"ts": 6079338128090.694, "dur": 12.619,
"args": {
"External id": 517,"Record function id": 0, "Ev Idx": 4
}
},
{
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 3416838, "tid": 3420252,
"ts": 6079338128190.264, "dur": 27.000,
"args": {
"External id": 518,"Record function id": 0, "Ev Idx": 5
}
},
{
"ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 3416838, "tid": 3420252,
"ts": 6079338128193.078, "dur": 22.043,
"args": {
"External id": 519,"Record function id": 0, "Ev Idx": 6
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 3416838, "tid": 3420252,
"ts": 6079338128202.662, "dur": 10.666,
"args": {
"External id": 520,"Record function id": 0, "Ev Idx": 7
}
},
{
"ph": "X", "cat": "cpu_op", "name": "detach", "pid": 3416838, "tid": 3420252,
"ts": 6079338128204.936, "dur": 8.222,
"args": {
"External id": 521,"Record function id": 0, "Ev Idx": 8
}
},
{
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 3416838, "tid": 3420252,
"ts": 6079338128221.901, "dur": 4.878,
"args": {
"External id": 522,"Record function id": 0, "Ev Idx": 9
}
},
{
"ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 3416838, "tid": 3420252,
"ts": 6079338128223.654, "dur": 2.354,
"args": {
"External id": 523,"Record function id": 0, "Ev Idx": 10
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 3416838, "tid": 3420252,
"ts": 6079338128224.706, "dur": 0.901,
"args": {
"External id": 524,"Record function id": 0, "Ev Idx": 11
}
},
{
"ph": "X", "cat": "cpu_op", "name": "detach", "pid": 3416838, "tid": 3420252,
"ts": 6079338128224.946, "dur": 0.531,
"args": {
"External id": 525,"Record function id": 0, "Ev Idx": 12
}
},
{
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 3416838, "tid": 3420252,
"ts": 6079338128230.745, "dur": 4.146,
"args": {
"External id": 526,"Record function id": 0, "Ev Idx": 13
}
},
{
"ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 3416838, "tid": 3420252,
"ts": 6079338128232.017, "dur": 2.143,
"args": {
"External id": 527,"Record function id": 0, "Ev Idx": 14
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 3416838, "tid": 3420252,
"ts": 6079338128232.978, "dur": 0.831,
"args": {
"External id": 528,"Record function id": 0, "Ev Idx": 15
}
},
{
"ph": "X", "cat": "cpu_op", "name": "detach", "pid": 3416838, "tid": 3420252,
"ts": 6079338128233.158, "dur": 0.551,
"args": {
"External id": 529,"Record function id": 0, "Ev Idx": 16
}
},
{
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 3416838, "tid": 3420252,
"ts": 6079338128994.396, "dur": 191.359,
"args": {
"External id": 530,"Record function id": 0, "Sequence number": 128, "Fwd thread id": 1, "Ev Idx": 17
}
},
{
"ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 3416838, "tid": 3420252,
"ts": 6079338128997.421, "dur": 172.500,
"args": {
"External id": 531,"Record function id": 0, "Sequence number": 128, "Fwd thread id": 1, "Ev Idx": 18
}
},
{
"ph": "f", "id": 2, "pid": 3416838, "tid": 3420252, "ts": 6079338128997.421,
"cat": "fwdbwd", "name": "fwdbwd", "bp": "e"
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_0", "pid": 3416838, "tid": 3420252,
"ts": 6079338129098.373, "dur": 24.006,
"args": {
"External id": 532,"Record function id": 0, "Ev Idx": 19
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_1", "pid": 3416838, "tid": 3420252,
"ts": 6079338129129.299, "dur": 9.094,
"args": {
"External id": 533,"Record function id": 0, "Ev Idx": 20
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_mul_2", "pid": 3416838, "tid": 3420252,
"ts": 6079338129148.188, "dur": 8.563,
"args": {
"External id": 534,"Record function id": 0, "Ev Idx": 21
}
},
{
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 3416838, "tid": 3420252,
"ts": 6079338129195.830, "dur": 7.792,
"args": {
"External id": 535,"Record function id": 0, "Ev Idx": 22
}
},
{
"ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 3416838, "tid": 3420252,
"ts": 6079338129197.512, "dur": 5.068,
"args": {
"External id": 536,"Record function id": 0, "Ev Idx": 23
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 3416838, "tid": 3420252,
"ts": 6079338129199.285, "dur": 2.614,
"args": {
"External id": 537,"Record function id": 0, "Ev Idx": 24
}
},
{
"ph": "X", "cat": "cpu_op", "name": "detach", "pid": 3416838, "tid": 3420252,
"ts": 6079338129199.806, "dur": 1.953,
"args": {
"External id": 538,"Record function id": 0, "Ev Idx": 25
}
},
{
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 3416838, "tid": 3420252,
"ts": 6079338129207.918, "dur": 4.307,
"args": {
"External id": 539,"Record function id": 0, "Ev Idx": 26
}
},
{
"ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 3416838, "tid": 3420252,
"ts": 6079338129209.360, "dur": 2.133,
"args": {
"External id": 540,"Record function id": 0, "Ev Idx": 27
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 3416838, "tid": 3420252,
"ts": 6079338129210.151, "dur": 1.012,
"args": {
"External id": 541,"Record function id": 0, "Ev Idx": 28
}
},
{
"ph": "X", "cat": "cpu_op", "name": "detach", "pid": 3416838, "tid": 3420252,
"ts": 6079338129210.372, "dur": 0.681,
"args": {
"External id": 542,"Record function id": 0, "Ev Idx": 29
}
},
{
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 3416838, "tid": 3420252,
"ts": 6079338129215.960, "dur": 3.666,
"args": {
"External id": 543,"Record function id": 0, "Ev Idx": 30
}
},
{
"ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 3416838, "tid": 3420252,
"ts": 6079338129217.072, "dur": 1.823,
"args": {
"External id": 544,"Record function id": 0, "Ev Idx": 31
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 3416838, "tid": 3420252,
"ts": 6079338129217.773, "dur": 0.821,
"args": {
"External id": 545,"Record function id": 0, "Ev Idx": 32
}
},
{
"ph": "X", "cat": "cpu_op", "name": "detach", "pid": 3416838, "tid": 3420252,
"ts": 6079338129217.943, "dur": 0.551,
"args": {
"External id": 546,"Record function id": 0, "Ev Idx": 33
}
},
{
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 3416838, "tid": 3420252,
"ts": 6079338129870.026, "dur": 178.059,
"args": {
"External id": 547,"Record function id": 0, "Sequence number": 129, "Fwd thread id": 1, "Ev Idx": 34
}
},
{
"ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 3416838, "tid": 3420252,
"ts": 6079338129873.251, "dur": 159.981,
"args": {
"External id": 548,"Record function id": 0, "Sequence number": 129, "Fwd thread id": 1, "Ev Idx": 35
}
},
{
"ph": "f", "id": 3, "pid": 3416838, "tid": 3420252, "ts": 6079338129873.251,
"cat": "fwdbwd", "name": "fwdbwd", "bp": "e"
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_0", "pid": 3416838, "tid": 3420252,
"ts": 6079338129967.714, "dur": 20.981,
"args": {
"External id": 549,"Record function id": 0, "Ev Idx": 36
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_1", "pid": 3416838, "tid": 3420252,
"ts": 6079338129995.225, "dur": 9.364,
"args": {
"External id": 550,"Record function id": 0, "Ev Idx": 37
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_mul_2", "pid": 3416838, "tid": 3420252,
"ts": 6079338130013.162, "dur": 7.041,
"args": {
"External id": 551,"Record function id": 0, "Ev Idx": 38
}
},
{
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 3416838, "tid": 3420252,
"ts": 6079338130057.078, "dur": 7.171,
"args": {
"External id": 552,"Record function id": 0, "Ev Idx": 39
}
},
{
"ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 3416838, "tid": 3420252,
"ts": 6079338130058.811, "dur": 4.326,
"args": {
"External id": 553,"Record function id": 0, "Ev Idx": 40
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 3416838, "tid": 3420252,
"ts": 6079338130060.213, "dur": 2.314,
"args": {
"External id": 554,"Record function id": 0, "Ev Idx": 41
}
},
{
"ph": "X", "cat": "cpu_op", "name": "detach", "pid": 3416838, "tid": 3420252,
"ts": 6079338130060.704, "dur": 1.682,
"args": {
"External id": 555,"Record function id": 0, "Ev Idx": 42
}
},
{
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 3416838, "tid": 3420252,
"ts": 6079338130068.616, "dur": 4.036,
"args": {
"External id": 556,"Record function id": 0, "Ev Idx": 43
}
},
{
"ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 3416838, "tid": 3420252,
"ts": 6079338130069.838, "dur": 2.083,
"args": {
"External id": 557,"Record function id": 0, "Ev Idx": 44
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 3416838, "tid": 3420252,
"ts": 6079338130070.619, "dur": 0.971,
"args": {
"External id": 558,"Record function id": 0, "Ev Idx": 45
}
},
{
"ph": "X", "cat": "cpu_op", "name": "detach", "pid": 3416838, "tid": 3420252,
"ts": 6079338130070.799, "dur": 0.681,
"args": {
"External id": 559,"Record function id": 0, "Ev Idx": 46
}
},
{
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 3416838, "tid": 3420252,
"ts": 6079338130076.387, "dur": 3.596,
"args": {
"External id": 560,"Record function id": 0, "Ev Idx": 47
}
},
{
"ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 3416838, "tid": 3420252,
"ts": 6079338130077.449, "dur": 1.853,
"args": {
"External id": 561,"Record function id": 0, "Ev Idx": 48
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 3416838, "tid": 3420252,
"ts": 6079338130078.140, "dur": 0.841,
"args": {
"External id": 562,"Record function id": 0, "Ev Idx": 49
}
},
{
"ph": "X", "cat": "cpu_op", "name": "detach", "pid": 3416838, "tid": 3420252,
"ts": 6079338130078.340, "dur": 0.551,
"args": {
"External id": 563,"Record function id": 0, "Ev Idx": 50
}
},
{
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 3416838, "tid": 3420252,
"ts": 6079338130731.285, "dur": 175.034,
"args": {
"External id": 564,"Record function id": 0, "Sequence number": 130, "Fwd thread id": 1, "Ev Idx": 51
}
},
{
"ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 3416838, "tid": 3420252,
"ts": 6079338130734.560, "dur": 157.577,
"args": {
"External id": 565,"Record function id": 0, "Sequence number": 130, "Fwd thread id": 1, "Ev Idx": 52
}
},
{
"ph": "f", "id": 4, "pid": 3416838, "tid": 3420252, "ts": 6079338130734.560,
"cat": "fwdbwd", "name": "fwdbwd", "bp": "e"
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_0", "pid": 3416838, "tid": 3420252,
"ts": 6079338130827.169, "dur": 20.702,
"args": {
"External id": 566,"Record function id": 0, "Ev Idx": 53
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_1", "pid": 3416838, "tid": 3420252,
"ts": 6079338130854.310, "dur": 8.503,
"args": {
"External id": 567,"Record function id": 0, "Ev Idx": 54
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_mul_2", "pid": 3416838, "tid": 3420252,
"ts": 6079338130871.256, "dur": 7.862,
"args": {
"External id": 568,"Record function id": 0, "Ev Idx": 55
}
},
{
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 3416838, "tid": 3420252,
"ts": 6079338130915.753, "dur": 8.092,
"args": {
"External id": 569,"Record function id": 0, "Ev Idx": 56
}
},
{
"ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 3416838, "tid": 3420252,
"ts": 6079338130917.375, "dur": 4.888,
"args": {
"External id": 570,"Record function id": 0, "Ev Idx": 57
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 3416838, "tid": 3420252,
"ts": 6079338130918.988, "dur": 2.444,
"args": {
"External id": 571,"Record function id": 0, "Ev Idx": 58
}
},
{
"ph": "X", "cat": "cpu_op", "name": "detach", "pid": 3416838, "tid": 3420252,
"ts": 6079338130919.479, "dur": 1.812,
"args": {
"External id": 572,"Record function id": 0, "Ev Idx": 59
}
},
{
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 3416838, "tid": 3420252,
"ts": 6079338130927.981, "dur": 3.566,
"args": {
"External id": 573,"Record function id": 0, "Ev Idx": 60
}
},
{
"ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 3416838, "tid": 3420252,
"ts": 6079338130929.053, "dur": 1.803,
"args": {
"External id": 574,"Record function id": 0, "Ev Idx": 61
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 3416838, "tid": 3420252,
"ts": 6079338130929.804, "dur": 0.681,
"args": {
"External id": 575,"Record function id": 0, "Ev Idx": 62
}
},
{
"ph": "X", "cat": "cpu_op", "name": "detach", "pid": 3416838, "tid": 3420252,
"ts": 6079338130929.984, "dur": 0.391,
"args": {
"External id": 576,"Record function id": 0, "Ev Idx": 63
}
},
{
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 3416838, "tid": 3420252,
"ts": 6079338130936.134, "dur": 3.405,
"args": {
"External id": 577,"Record function id": 0, "Ev Idx": 64
}
},
{
"ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 3416838, "tid": 3420252,
"ts": 6079338130937.185, "dur": 1.633,
"args": {
"External id": 578,"Record function id": 0, "Ev Idx": 65
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 3416838, "tid": 3420252,
"ts": 6079338130937.886, "dur": 0.641,
"args": {
"External id": 579,"Record function id": 0, "Ev Idx": 66
}
},
{
"ph": "X", "cat": "cpu_op", "name": "detach", "pid": 3416838, "tid": 3420252,
"ts": 6079338130938.057, "dur": 0.370,
"args": {
"External id": 580,"Record function id": 0, "Ev Idx": 67
}
},
{
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 3416838, "tid": 3420252,
"ts": 6079338131584.291, "dur": 171.078,
"args": {
"External id": 581,"Record function id": 0, "Sequence number": 131, "Fwd thread id": 1, "Ev Idx": 68
}
},
{
"ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 3416838, "tid": 3420252,
"ts": 6079338131586.855, "dur": 154.994,
"args": {
"External id": 582,"Record function id": 0, "Sequence number": 131, "Fwd thread id": 1, "Ev Idx": 69
}
},
{
"ph": "f", "id": 5, "pid": 3416838, "tid": 3420252, "ts": 6079338131586.855,
"cat": "fwdbwd", "name": "fwdbwd", "bp": "e"
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_0", "pid": 3416838, "tid": 3420252,
"ts": 6079338131677.161, "dur": 20.862,
"args": {
"External id": 583,"Record function id": 0, "Ev Idx": 70
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_1", "pid": 3416838, "tid": 3420252,
"ts": 6079338131705.203, "dur": 8.042,
"args": {
"External id": 584,"Record function id": 0, "Ev Idx": 71
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_mul_2", "pid": 3416838, "tid": 3420252,
"ts": 6079338131721.738, "dur": 7.602,
"args": {
"External id": 585,"Record function id": 0, "Ev Idx": 72
}
},
{
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 3416838, "tid": 3420252,
"ts": 6079338131764.062, "dur": 8.232,
"args": {
"External id": 586,"Record function id": 0, "Ev Idx": 73
}
},
{
"ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 3416838, "tid": 3420252,
"ts": 6079338131765.675, "dur": 5.648,
"args": {
"External id": 587,"Record function id": 0, "Ev Idx": 74
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 3416838, "tid": 3420252,
"ts": 6079338131767.858, "dur": 2.894,
"args": {
"External id": 588,"Record function id": 0, "Ev Idx": 75
}
},
{
"ph": "X", "cat": "cpu_op", "name": "detach", "pid": 3416838, "tid": 3420252,
"ts": 6079338131768.709, "dur": 1.923,
"args": {
"External id": 589,"Record function id": 0, "Ev Idx": 76
}
},
{
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 3416838, "tid": 3420252,
"ts": 6079338131776.381, "dur": 3.936,
"args": {
"External id": 590,"Record function id": 0, "Ev Idx": 77
}
},
{
"ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 3416838, "tid": 3420252,
"ts": 6079338131777.462, "dur": 2.144,
"args": {
"External id": 591,"Record function id": 0, "Ev Idx": 78
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 3416838, "tid": 3420252,
"ts": 6079338131778.203, "dur": 0.912,
"args": {
"External id": 592,"Record function id": 0, "Ev Idx": 79
}
},
{
"ph": "X", "cat": "cpu_op", "name": "detach", "pid": 3416838, "tid": 3420252,
"ts": 6079338131778.394, "dur": 0.631,
"args": {
"External id": 593,"Record function id": 0, "Ev Idx": 80
}
},
{
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 3416838, "tid": 3420252,
"ts": 6079338131784.253, "dur": 3.835,
"args": {
"External id": 594,"Record function id": 0, "Ev Idx": 81
}
},
{
"ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 3416838, "tid": 3420252,
"ts": 6079338131785.494, "dur": 1.933,
"args": {
"External id": 595,"Record function id": 0, "Ev Idx": 82
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 3416838, "tid": 3420252,
"ts": 6079338131786.185, "dur": 0.922,
"args": {
"External id": 596,"Record function id": 0, "Ev Idx": 83
}
},
{
"ph": "X", "cat": "cpu_op", "name": "detach", "pid": 3416838, "tid": 3420252,
"ts": 6079338131786.456, "dur": 0.551,
"args": {
"External id": 597,"Record function id": 0, "Ev Idx": 84
}
},
{
"ph": "X", "cat": "user_annotation", "name": "Step 0", "pid": 3416838, "tid": 3416838,
"ts": 6079338125248.388, "dur": 3104.821,
"args": {
"External id": 1,"Record function id": 0, "Ev Idx": 85
}
},
{
"ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 3416838, "tid": 3416838,
"ts": 6079338125327.908, "dur": 66.961,
"args": {
"External id": 2,"Record function id": 0, "Ev Idx": 86
}
},
{
"ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 0/0", "pid": 3416838, "tid": 3416838,
"ts": 6079338125398.615, "dur": 2876.707,
"args": {
"External id": 3,"Record function id": 0, "Ev Idx": 87
}
},
{
"ph": "X", "cat": "cpu_op", "name": "Pregraph bytecode", "pid": 3416838, "tid": 3416838,
"ts": 6079338125414.078, "dur": 5.478,
"args": {
"External id": 4,"Record function id": 0, "Ev Idx": 88
}
},
{
"ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 3416838, "tid": 3416838,
"ts": 6079338125492.987, "dur": 1788.066,
"args": {
"External id": 5,"Record function id": 0, "Sequence number": 127, "Fwd thread id": 0, "Ev Idx": 89
}
},
{
"ph": "s", "id": 1, "pid": 3416838, "tid": 3416838, "ts": 6079338125492.987,
"cat": "fwdbwd", "name": "fwdbwd"
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused_nll_loss_backward_nll_loss_forward_0", "pid": 3416838, "tid": 3416838,
"ts": 6079338126492.474, "dur": 63.816,
"args": {
"External id": 6,"Record function id": 0, "Ev Idx": 90
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_per_fused_nll_loss_forward_1", "pid": 3416838, "tid": 3416838,
"ts": 6079338126575.158, "dur": 13.711,
"args": {
"External id": 7,"Record function id": 0, "Ev Idx": 91
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_mul_2", "pid": 3416838, "tid": 3416838,
"ts": 6079338126608.699, "dur": 13.661,
"args": {
"External id": 8,"Record function id": 0, "Ev Idx": 92
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 3416838, "tid": 3416838,
"ts": 6079338126649.100, "dur": 55.213,
"args": {
"External id": 9,"Record function id": 0, "Ev Idx": 93
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 3416838, "tid": 3416838,
"ts": 6079338126662.781, "dur": 37.947,
"args": {
"External id": 10,"Record function id": 0, "Ev Idx": 94
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 3416838, "tid": 3416838,
"ts": 6079338126671.043, "dur": 23.976,
"args": {
"External id": 11,"Record function id": 0, "Ev Idx": 95
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 3416838, "tid": 3416838,
"ts": 6079338126683.241, "dur": 11.217,
"args": {
"External id": 12,"Record function id": 0, "Ev Idx": 96
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 3416838, "tid": 3416838,
"ts": 6079338126687.177, "dur": 5.128,
"args": {
"External id": 13,"Record function id": 0, "Ev Idx": 97
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 3416838, "tid": 3416838,
"ts": 6079338126715.941, "dur": 3.375,
"args": {
"External id": 14,"Record function id": 0, "Ev Idx": 98
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 3416838, "tid": 3416838,
"ts": 6079338126716.391, "dur": 2.634,
"args": {
"External id": 15,"Record function id": 0, "Ev Idx": 99
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 3416838, "tid": 3416838,
"ts": 6079338126716.862, "dur": 1.613,
"args": {
"External id": 16,"Record function id": 0, "Ev Idx": 100
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 3416838, "tid": 3416838,
"ts": 6079338126717.223, "dur": 1.051,
"args": {
"External id": 17,"Record function id": 0, "Ev Idx": 101
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 3416838, "tid": 3416838,
"ts": 6079338126717.603, "dur": 0.381,
"args": {
"External id": 18,"Record function id": 0, "Ev Idx": 102
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 3416838, "tid": 3416838,
"ts": 6079338126722.671, "dur": 4.176,
"args": {
"External id": 19,"Record function id": 0, "Ev Idx": 103
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 3416838, "tid": 3416838,
"ts": 6079338126722.991, "dur": 3.606,
"args": {
"External id": 20,"Record function id": 0, "Ev Idx": 104
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 3416838, "tid": 3416838,
"ts": 6079338126723.362, "dur": 2.664,
"args": {
"External id": 21,"Record function id": 0, "Ev Idx": 105
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 3416838, "tid": 3416838,
"ts": 6079338126723.702, "dur": 2.144,
"args": {
"External id": 22,"Record function id": 0, "Ev Idx": 106
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 3416838, "tid": 3416838,
"ts": 6079338126723.983, "dur": 1.632,
"args": {
"External id": 23,"Record function id": 0, "Ev Idx": 107
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 3416838, "tid": 3416838,
"ts": 6079338126730.202, "dur": 2.564,
"args": {
"External id": 24,"Record function id": 0, "Ev Idx": 108
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 3416838, "tid": 3416838,
"ts": 6079338126730.503, "dur": 2.013,
"args": {
"External id": 25,"Record function id": 0, "Ev Idx": 109
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 3416838, "tid": 3416838,
"ts": 6079338126730.863, "dur": 1.232,
"args": {
"External id": 26,"Record function id": 0, "Ev Idx": 110
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 3416838, "tid": 3416838,
"ts": 6079338126731.194, "dur": 0.761,
"args": {
"External id": 27,"Record function id": 0, "Ev Idx": 111
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 3416838, "tid": 3416838,
"ts": 6079338126731.474, "dur": 0.281,
"args": {
"External id": 28,"Record function id": 0, "Ev Idx": 112
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_3", "pid": 3416838, "tid": 3416838,
"ts": 6079338126748.039, "dur": 13.551,
"args": {
"External id": 29,"Record function id": 0, "Ev Idx": 113
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_4", "pid": 3416838, "tid": 3416838,
"ts": 6079338126774.048, "dur": 11.297,
"args": {
"External id": 30,"Record function id": 0, "Ev Idx": 114
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_5", "pid": 3416838, "tid": 3416838,
"ts": 6079338126796.262, "dur": 10.095,
"args": {
"External id": 31,"Record function id": 0, "Ev Idx": 115
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 3416838, "tid": 3416838,
"ts": 6079338126838.646, "dur": 93.411,
"args": {
"External id": 32,"Record function id": 0, "Ev Idx": 116
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused__log_softmax__log_softmax_backward_data_addmm_nll_loss_backward_nll_loss_forward_6", "pid": 3416838, "tid": 3416838,
"ts": 6079338126954.410, "dur": 20.191,
"args": {
"External id": 33,"Record function id": 0, "Ev Idx": 117
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 3416838, "tid": 3416838,
"ts": 6079338126982.723, "dur": 35.073,
"args": {
"External id": 34,"Record function id": 0, "Ev Idx": 118
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused_nll_loss_forward_sum_7", "pid": 3416838, "tid": 3416838,
"ts": 6079338127034.892, "dur": 13.901,
"args": {
"External id": 35,"Record function id": 0, "Ev Idx": 119
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::addmm", "pid": 3416838, "tid": 3416838,
"ts": 6079338127060.670, "dur": 85.499,
"args": {
"External id": 36,"Record function id": 0, "Ev Idx": 120
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused_nll_loss_forward_8", "pid": 3416838, "tid": 3416838,
"ts": 6079338127172.078, "dur": 13.401,
"args": {
"External id": 37,"Record function id": 0, "Ev Idx": 121
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_per_fused_nll_loss_forward_9", "pid": 3416838, "tid": 3416838,
"ts": 6079338127196.706, "dur": 11.537,
"args": {
"External id": 38,"Record function id": 0, "Ev Idx": 122
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::ones_like", "pid": 3416838, "tid": 3416838,
"ts": 6079338127352.340, "dur": 53.361,
"args": {
"External id": 39,"Record function id": 0, "Ev Idx": 123
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 3416838, "tid": 3416838,
"ts": 6079338127354.894, "dur": 20.521,
"args": {
"External id": 40,"Record function id": 0, "Ev Idx": 124
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 3416838, "tid": 3416838,
"ts": 6079338127363.177, "dur": 11.647,
"args": {
"External id": 41,"Record function id": 0, "Ev Idx": 125
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 3416838, "tid": 3416838,
"ts": 6079338127381.715, "dur": 23.735,
"args": {
"External id": 42,"Record function id": 0, "Ev Idx": 126
}
},
{
"ph": "X", "cat": "user_annotation", "name": "Step 1", "pid": 3416838, "tid": 3416838,
"ts": 6079338128367.100, "dur": 914.439,
"args": {
"External id": 43,"Record function id": 0, "Ev Idx": 127
}
},
{
"ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 3416838, "tid": 3416838,
"ts": 6079338128395.133, "dur": 20.671,
"args": {
"External id": 44,"Record function id": 0, "Ev Idx": 128
}
},
{
"ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 0/0", "pid": 3416838, "tid": 3416838,
"ts": 6079338128416.445, "dur": 829.480,
"args": {
"External id": 45,"Record function id": 0, "Ev Idx": 129
}
},
{
"ph": "X", "cat": "cpu_op", "name": "Pregraph bytecode", "pid": 3416838, "tid": 3416838,
"ts": 6079338128420.811, "dur": 2.524,
"args": {
"External id": 46,"Record function id": 0, "Ev Idx": 130
}
},
{
"ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 3416838, "tid": 3416838,
"ts": 6079338128455.013, "dur": 456.188,
"args": {
"External id": 47,"Record function id": 0, "Sequence number": 128, "Fwd thread id": 0, "Ev Idx": 131
}
},
{
"ph": "s", "id": 2, "pid": 3416838, "tid": 3416838, "ts": 6079338128455.013,
"cat": "fwdbwd", "name": "fwdbwd"
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused_nll_loss_backward_nll_loss_forward_0", "pid": 3416838, "tid": 3416838,
"ts": 6079338128516.105, "dur": 27.481,
"args": {
"External id": 48,"Record function id": 0, "Ev Idx": 132
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_per_fused_nll_loss_forward_1", "pid": 3416838, "tid": 3416838,
"ts": 6079338128554.182, "dur": 9.625,
"args": {
"External id": 49,"Record function id": 0, "Ev Idx": 133
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_mul_2", "pid": 3416838, "tid": 3416838,
"ts": 6079338128572.760, "dur": 8.032,
"args": {
"External id": 50,"Record function id": 0, "Ev Idx": 134
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 3416838, "tid": 3416838,
"ts": 6079338128588.805, "dur": 12.018,
"args": {
"External id": 51,"Record function id": 0, "Ev Idx": 135
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 3416838, "tid": 3416838,
"ts": 6079338128591.078, "dur": 8.883,
"args": {
"External id": 52,"Record function id": 0, "Ev Idx": 136
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 3416838, "tid": 3416838,
"ts": 6079338128593.161, "dur": 5.158,
"args": {
"External id": 53,"Record function id": 0, "Ev Idx": 137
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 3416838, "tid": 3416838,
"ts": 6079338128595.034, "dur": 2.944,
"args": {
"External id": 54,"Record function id": 0, "Ev Idx": 138
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 3416838, "tid": 3416838,
"ts": 6079338128596.206, "dur": 1.111,
"args": {
"External id": 55,"Record function id": 0, "Ev Idx": 139
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 3416838, "tid": 3416838,
"ts": 6079338128607.312, "dur": 3.606,
"args": {
"External id": 56,"Record function id": 0, "Ev Idx": 140
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 3416838, "tid": 3416838,
"ts": 6079338128607.743, "dur": 2.884,
"args": {
"External id": 57,"Record function id": 0, "Ev Idx": 141
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 3416838, "tid": 3416838,
"ts": 6079338128608.194, "dur": 1.863,
"args": {
"External id": 58,"Record function id": 0, "Ev Idx": 142
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 3416838, "tid": 3416838,
"ts": 6079338128608.584, "dur": 1.272,
"args": {
"External id": 59,"Record function id": 0, "Ev Idx": 143
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 3416838, "tid": 3416838,
"ts": 6079338128608.955, "dur": 0.631,
"args": {
"External id": 60,"Record function id": 0, "Ev Idx": 144
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 3416838, "tid": 3416838,
"ts": 6079338128614.363, "dur": 2.924,
"args": {
"External id": 61,"Record function id": 0, "Ev Idx": 145
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 3416838, "tid": 3416838,
"ts": 6079338128614.684, "dur": 2.373,
"args": {
"External id": 62,"Record function id": 0, "Ev Idx": 146
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 3416838, "tid": 3416838,
"ts": 6079338128615.064, "dur": 1.542,
"args": {
"External id": 63,"Record function id": 0, "Ev Idx": 147
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 3416838, "tid": 3416838,
"ts": 6079338128615.415, "dur": 1.031,
"args": {
"External id": 64,"Record function id": 0, "Ev Idx": 148
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 3416838, "tid": 3416838,
"ts": 6079338128615.705, "dur": 0.531,
"args": {
"External id": 65,"Record function id": 0, "Ev Idx": 149
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 3416838, "tid": 3416838,
"ts": 6079338128620.623, "dur": 3.184,
"args": {
"External id": 66,"Record function id": 0, "Ev Idx": 150
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 3416838, "tid": 3416838,
"ts": 6079338128620.943, "dur": 2.614,
"args": {
"External id": 67,"Record function id": 0, "Ev Idx": 151
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 3416838, "tid": 3416838,
"ts": 6079338128621.294, "dur": 1.602,
"args": {
"External id": 68,"Record function id": 0, "Ev Idx": 152
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 3416838, "tid": 3416838,
"ts": 6079338128621.614, "dur": 1.122,
"args": {
"External id": 69,"Record function id": 0, "Ev Idx": 153
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 3416838, "tid": 3416838,
"ts": 6079338128621.894, "dur": 0.651,
"args": {
"External id": 70,"Record function id": 0, "Ev Idx": 154
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_3", "pid": 3416838, "tid": 3416838,
"ts": 6079338128635.024, "dur": 9.274,
"args": {
"External id": 71,"Record function id": 0, "Ev Idx": 155
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_4", "pid": 3416838, "tid": 3416838,
"ts": 6079338128651.459, "dur": 6.910,
"args": {
"External id": 72,"Record function id": 0, "Ev Idx": 156
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_5", "pid": 3416838, "tid": 3416838,
"ts": 6079338128665.650, "dur": 6.150,
"args": {
"External id": 73,"Record function id": 0, "Ev Idx": 157
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 3416838, "tid": 3416838,
"ts": 6079338128690.468, "dur": 24.226,
"args": {
"External id": 74,"Record function id": 0, "Ev Idx": 158
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused__log_softmax__log_softmax_backward_data_addmm_nll_loss_backward_nll_loss_forward_6", "pid": 3416838, "tid": 3416838,
"ts": 6079338128728.665, "dur": 14.703,
"args": {
"External id": 75,"Record function id": 0, "Ev Idx": 159
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 3416838, "tid": 3416838,
"ts": 6079338128751.270, "dur": 13.810,
"args": {
"External id": 76,"Record function id": 0, "Ev Idx": 160
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused_nll_loss_forward_sum_7", "pid": 3416838, "tid": 3416838,
"ts": 6079338128779.432, "dur": 9.685,
"args": {
"External id": 77,"Record function id": 0, "Ev Idx": 161
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::addmm", "pid": 3416838, "tid": 3416838,
"ts": 6079338128797.369, "dur": 32.419,
"args": {
"External id": 78,"Record function id": 0, "Ev Idx": 162
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused_nll_loss_forward_8", "pid": 3416838, "tid": 3416838,
"ts": 6079338128847.755, "dur": 11.567,
"args": {
"External id": 79,"Record function id": 0, "Ev Idx": 163
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_per_fused_nll_loss_forward_9", "pid": 3416838, "tid": 3416838,
"ts": 6079338128867.405, "dur": 8.012,
"args": {
"External id": 80,"Record function id": 0, "Ev Idx": 164
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::ones_like", "pid": 3416838, "tid": 3416838,
"ts": 6079338128938.191, "dur": 18.638,
"args": {
"External id": 81,"Record function id": 0, "Ev Idx": 165
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 3416838, "tid": 3416838,
"ts": 6079338128938.942, "dur": 5.990,
"args": {
"External id": 82,"Record function id": 0, "Ev Idx": 166
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 3416838, "tid": 3416838,
"ts": 6079338128940.044, "dur": 4.437,
"args": {
"External id": 83,"Record function id": 0, "Ev Idx": 167
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 3416838, "tid": 3416838,
"ts": 6079338128945.743, "dur": 10.846,
"args": {
"External id": 84,"Record function id": 0, "Ev Idx": 168
}
},
{
"ph": "X", "cat": "user_annotation", "name": "Step 2", "pid": 3416838, "tid": 3416838,
"ts": 6079338129291.965, "dur": 847.608,
"args": {
"External id": 85,"Record function id": 0, "Ev Idx": 169
}
},
{
"ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 3416838, "tid": 3416838,
"ts": 6079338129320.958, "dur": 15.093,
"args": {
"External id": 86,"Record function id": 0, "Ev Idx": 170
}
},
{
"ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 0/0", "pid": 3416838, "tid": 3416838,
"ts": 6079338129336.702, "dur": 768.819,
"args": {
"External id": 87,"Record function id": 0, "Ev Idx": 171
}
},
{
"ph": "X", "cat": "cpu_op", "name": "Pregraph bytecode", "pid": 3416838, "tid": 3416838,
"ts": 6079338129340.598, "dur": 1.893,
"args": {
"External id": 88,"Record function id": 0, "Ev Idx": 172
}
},
{
"ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 3416838, "tid": 3416838,
"ts": 6079338129370.113, "dur": 420.043,
"args": {
"External id": 89,"Record function id": 0, "Sequence number": 129, "Fwd thread id": 0, "Ev Idx": 173
}
},
{
"ph": "s", "id": 3, "pid": 3416838, "tid": 3416838, "ts": 6079338129370.113,
"cat": "fwdbwd", "name": "fwdbwd"
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused_nll_loss_backward_nll_loss_forward_0", "pid": 3416838, "tid": 3416838,
"ts": 6079338129423.593, "dur": 23.746,
"args": {
"External id": 90,"Record function id": 0, "Ev Idx": 174
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_per_fused_nll_loss_forward_1", "pid": 3416838, "tid": 3416838,
"ts": 6079338129456.673, "dur": 8.733,
"args": {
"External id": 91,"Record function id": 0, "Ev Idx": 175
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_mul_2", "pid": 3416838, "tid": 3416838,
"ts": 6079338129474.079, "dur": 8.443,
"args": {
"External id": 92,"Record function id": 0, "Ev Idx": 176
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 3416838, "tid": 3416838,
"ts": 6079338129490.414, "dur": 11.367,
"args": {
"External id": 93,"Record function id": 0, "Ev Idx": 177
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 3416838, "tid": 3416838,
"ts": 6079338129492.127, "dur": 8.773,
"args": {
"External id": 94,"Record function id": 0, "Ev Idx": 178
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 3416838, "tid": 3416838,
"ts": 6079338129494.290, "dur": 5.198,
"args": {
"External id": 95,"Record function id": 0, "Ev Idx": 179
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 3416838, "tid": 3416838,
"ts": 6079338129496.263, "dur": 2.884,
"args": {
"External id": 96,"Record function id": 0, "Ev Idx": 180
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 3416838, "tid": 3416838,
"ts": 6079338129497.204, "dur": 1.262,
"args": {
"External id": 97,"Record function id": 0, "Ev Idx": 181
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 3416838, "tid": 3416838,
"ts": 6079338129508.181, "dur": 3.265,
"args": {
"External id": 98,"Record function id": 0, "Ev Idx": 182
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 3416838, "tid": 3416838,
"ts": 6079338129508.571, "dur": 2.534,
"args": {
"External id": 99,"Record function id": 0, "Ev Idx": 183
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 3416838, "tid": 3416838,
"ts": 6079338129509.012, "dur": 1.592,
"args": {
"External id": 100,"Record function id": 0, "Ev Idx": 184
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 3416838, "tid": 3416838,
"ts": 6079338129509.413, "dur": 1.001,
"args": {
"External id": 101,"Record function id": 0, "Ev Idx": 185
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 3416838, "tid": 3416838,
"ts": 6079338129509.783, "dur": 0.361,
"args": {
"External id": 102,"Record function id": 0, "Ev Idx": 186
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 3416838, "tid": 3416838,
"ts": 6079338129514.620, "dur": 2.955,
"args": {
"External id": 103,"Record function id": 0, "Ev Idx": 187
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 3416838, "tid": 3416838,
"ts": 6079338129514.911, "dur": 2.414,
"args": {
"External id": 104,"Record function id": 0, "Ev Idx": 188
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 3416838, "tid": 3416838,
"ts": 6079338129515.291, "dur": 1.543,
"args": {
"External id": 105,"Record function id": 0, "Ev Idx": 189
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 3416838, "tid": 3416838,
"ts": 6079338129515.662, "dur": 1.002,
"args": {
"External id": 106,"Record function id": 0, "Ev Idx": 190
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 3416838, "tid": 3416838,
"ts": 6079338129515.983, "dur": 0.460,
"args": {
"External id": 107,"Record function id": 0, "Ev Idx": 191
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 3416838, "tid": 3416838,
"ts": 6079338129520.449, "dur": 2.584,
"args": {
"External id": 108,"Record function id": 0, "Ev Idx": 192
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 3416838, "tid": 3416838,
"ts": 6079338129520.760, "dur": 2.053,
"args": {
"External id": 109,"Record function id": 0, "Ev Idx": 193
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 3416838, "tid": 3416838,
"ts": 6079338129521.090, "dur": 1.272,
"args": {
"External id": 110,"Record function id": 0, "Ev Idx": 194
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 3416838, "tid": 3416838,
"ts": 6079338129521.441, "dur": 0.781,
"args": {
"External id": 111,"Record function id": 0, "Ev Idx": 195
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 3416838, "tid": 3416838,
"ts": 6079338129521.741, "dur": 0.271,
"args": {
"External id": 112,"Record function id": 0, "Ev Idx": 196
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_3", "pid": 3416838, "tid": 3416838,
"ts": 6079338129533.168, "dur": 8.503,
"args": {
"External id": 113,"Record function id": 0, "Ev Idx": 197
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_4", "pid": 3416838, "tid": 3416838,
"ts": 6079338129548.662, "dur": 7.191,
"args": {
"External id": 114,"Record function id": 0, "Ev Idx": 198
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_5", "pid": 3416838, "tid": 3416838,
"ts": 6079338129563.114, "dur": 5.979,
"args": {
"External id": 115,"Record function id": 0, "Ev Idx": 199
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 3416838, "tid": 3416838,
"ts": 6079338129588.021, "dur": 22.524,
"args": {
"External id": 116,"Record function id": 0, "Ev Idx": 200
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused__log_softmax__log_softmax_backward_data_addmm_nll_loss_backward_nll_loss_forward_6", "pid": 3416838, "tid": 3416838,
"ts": 6079338129623.114, "dur": 13.060,
"args": {
"External id": 117,"Record function id": 0, "Ev Idx": 201
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 3416838, "tid": 3416838,
"ts": 6079338129646.509, "dur": 12.619,
"args": {
"External id": 118,"Record function id": 0, "Ev Idx": 202
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused_nll_loss_forward_sum_7", "pid": 3416838, "tid": 3416838,
"ts": 6079338129669.704, "dur": 8.774,
"args": {
"External id": 119,"Record function id": 0, "Ev Idx": 203
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::addmm", "pid": 3416838, "tid": 3416838,
"ts": 6079338129685.748, "dur": 31.017,
"args": {
"External id": 120,"Record function id": 0, "Ev Idx": 204
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused_nll_loss_forward_8", "pid": 3416838, "tid": 3416838,
"ts": 6079338129734.372, "dur": 10.145,
"args": {
"External id": 121,"Record function id": 0, "Ev Idx": 205
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_per_fused_nll_loss_forward_9", "pid": 3416838, "tid": 3416838,
"ts": 6079338129751.377, "dur": 8.243,
"args": {
"External id": 122,"Record function id": 0, "Ev Idx": 206
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::ones_like", "pid": 3416838, "tid": 3416838,
"ts": 6079338129813.922, "dur": 18.127,
"args": {
"External id": 123,"Record function id": 0, "Ev Idx": 207
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 3416838, "tid": 3416838,
"ts": 6079338129814.553, "dur": 5.638,
"args": {
"External id": 124,"Record function id": 0, "Ev Idx": 208
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 3416838, "tid": 3416838,
"ts": 6079338129815.624, "dur": 4.167,
"args": {
"External id": 125,"Record function id": 0, "Ev Idx": 209
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 3416838, "tid": 3416838,
"ts": 6079338129820.842, "dur": 10.967,
"args": {
"External id": 126,"Record function id": 0, "Ev Idx": 210
}
},
{
"ph": "X", "cat": "user_annotation", "name": "Step 3", "pid": 3416838, "tid": 3416838,
"ts": 6079338130149.758, "dur": 849.241,
"args": {
"External id": 127,"Record function id": 0, "Ev Idx": 211
}
},
{
"ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 3416838, "tid": 3416838,
"ts": 6079338130168.897, "dur": 14.221,
"args": {
"External id": 128,"Record function id": 0, "Ev Idx": 212
}
},
{
"ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 0/0", "pid": 3416838, "tid": 3416838,
"ts": 6079338130183.930, "dur": 781.879,
"args": {
"External id": 129,"Record function id": 0, "Ev Idx": 213
}
},
{
"ph": "X", "cat": "cpu_op", "name": "Pregraph bytecode", "pid": 3416838, "tid": 3416838,
"ts": 6079338130187.595, "dur": 1.963,
"args": {
"External id": 130,"Record function id": 0, "Ev Idx": 214
}
},
{
"ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 3416838, "tid": 3416838,
"ts": 6079338130216.088, "dur": 434.736,
"args": {
"External id": 131,"Record function id": 0, "Sequence number": 130, "Fwd thread id": 0, "Ev Idx": 215
}
},
{
"ph": "s", "id": 4, "pid": 3416838, "tid": 3416838, "ts": 6079338130216.088,
"cat": "fwdbwd", "name": "fwdbwd"
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused_nll_loss_backward_nll_loss_forward_0", "pid": 3416838, "tid": 3416838,
"ts": 6079338130269.188, "dur": 23.906,
"args": {
"External id": 132,"Record function id": 0, "Ev Idx": 216
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_per_fused_nll_loss_forward_1", "pid": 3416838, "tid": 3416838,
"ts": 6079338130310.460, "dur": 10.396,
"args": {
"External id": 133,"Record function id": 0, "Ev Idx": 217
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_mul_2", "pid": 3416838, "tid": 3416838,
"ts": 6079338130329.339, "dur": 7.561,
"args": {
"External id": 134,"Record function id": 0, "Ev Idx": 218
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 3416838, "tid": 3416838,
"ts": 6079338130344.241, "dur": 10.947,
"args": {
"External id": 135,"Record function id": 0, "Ev Idx": 219
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 3416838, "tid": 3416838,
"ts": 6079338130346.014, "dur": 8.313,
"args": {
"External id": 136,"Record function id": 0, "Ev Idx": 220
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 3416838, "tid": 3416838,
"ts": 6079338130347.767, "dur": 5.107,
"args": {
"External id": 137,"Record function id": 0, "Ev Idx": 221
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 3416838, "tid": 3416838,
"ts": 6079338130349.690, "dur": 2.854,
"args": {
"External id": 138,"Record function id": 0, "Ev Idx": 222
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 3416838, "tid": 3416838,
"ts": 6079338130350.611, "dur": 1.232,
"args": {
"External id": 139,"Record function id": 0, "Ev Idx": 223
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 3416838, "tid": 3416838,
"ts": 6079338130361.427, "dur": 4.327,
"args": {
"External id": 140,"Record function id": 0, "Ev Idx": 224
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 3416838, "tid": 3416838,
"ts": 6079338130361.868, "dur": 3.585,
"args": {
"External id": 141,"Record function id": 0, "Ev Idx": 225
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 3416838, "tid": 3416838,
"ts": 6079338130362.469, "dur": 2.093,
"args": {
"External id": 142,"Record function id": 0, "Ev Idx": 226
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 3416838, "tid": 3416838,
"ts": 6079338130362.940, "dur": 1.422,
"args": {
"External id": 143,"Record function id": 0, "Ev Idx": 227
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 3416838, "tid": 3416838,
"ts": 6079338130363.510, "dur": 0.561,
"args": {
"External id": 144,"Record function id": 0, "Ev Idx": 228
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 3416838, "tid": 3416838,
"ts": 6079338130368.888, "dur": 2.845,
"args": {
"External id": 145,"Record function id": 0, "Ev Idx": 229
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 3416838, "tid": 3416838,
"ts": 6079338130369.189, "dur": 2.303,
"args": {
"External id": 146,"Record function id": 0, "Ev Idx": 230
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 3416838, "tid": 3416838,
"ts": 6079338130369.570, "dur": 1.422,
"args": {
"External id": 147,"Record function id": 0, "Ev Idx": 231
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 3416838, "tid": 3416838,
"ts": 6079338130369.940, "dur": 0.881,
"args": {
"External id": 148,"Record function id": 0, "Ev Idx": 232
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 3416838, "tid": 3416838,
"ts": 6079338130370.241, "dur": 0.370,
"args": {
"External id": 149,"Record function id": 0, "Ev Idx": 233
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 3416838, "tid": 3416838,
"ts": 6079338130374.557, "dur": 2.985,
"args": {
"External id": 150,"Record function id": 0, "Ev Idx": 234
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 3416838, "tid": 3416838,
"ts": 6079338130374.948, "dur": 2.343,
"args": {
"External id": 151,"Record function id": 0, "Ev Idx": 235
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 3416838, "tid": 3416838,
"ts": 6079338130375.308, "dur": 1.322,
"args": {
"External id": 152,"Record function id": 0, "Ev Idx": 236
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 3416838, "tid": 3416838,
"ts": 6079338130375.629, "dur": 0.861,
"args": {
"External id": 153,"Record function id": 0, "Ev Idx": 237
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 3416838, "tid": 3416838,
"ts": 6079338130375.999, "dur": 0.281,
"args": {
"External id": 154,"Record function id": 0, "Ev Idx": 238
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_3", "pid": 3416838, "tid": 3416838,
"ts": 6079338130387.086, "dur": 8.413,
"args": {
"External id": 155,"Record function id": 0, "Ev Idx": 239
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_4", "pid": 3416838, "tid": 3416838,
"ts": 6079338130402.269, "dur": 6.770,
"args": {
"External id": 156,"Record function id": 0, "Ev Idx": 240
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_5", "pid": 3416838, "tid": 3416838,
"ts": 6079338130415.669, "dur": 5.889,
"args": {
"External id": 157,"Record function id": 0, "Ev Idx": 241
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 3416838, "tid": 3416838,
"ts": 6079338130441.368, "dur": 22.584,
"args": {
"External id": 158,"Record function id": 0, "Ev Idx": 242
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused__log_softmax__log_softmax_backward_data_addmm_nll_loss_backward_nll_loss_forward_6", "pid": 3416838, "tid": 3416838,
"ts": 6079338130477.933, "dur": 14.402,
"args": {
"External id": 159,"Record function id": 0, "Ev Idx": 243
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 3416838, "tid": 3416838,
"ts": 6079338130500.447, "dur": 13.250,
"args": {
"External id": 160,"Record function id": 0, "Ev Idx": 244
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused_nll_loss_forward_sum_7", "pid": 3416838, "tid": 3416838,
"ts": 6079338130527.127, "dur": 8.633,
"args": {
"External id": 161,"Record function id": 0, "Ev Idx": 245
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::addmm", "pid": 3416838, "tid": 3416838,
"ts": 6079338130542.480, "dur": 33.000,
"args": {
"External id": 162,"Record function id": 0, "Ev Idx": 246
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused_nll_loss_forward_8", "pid": 3416838, "tid": 3416838,
"ts": 6079338130593.026, "dur": 10.016,
"args": {
"External id": 163,"Record function id": 0, "Ev Idx": 247
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_per_fused_nll_loss_forward_9", "pid": 3416838, "tid": 3416838,
"ts": 6079338130610.533, "dur": 8.993,
"args": {
"External id": 164,"Record function id": 0, "Ev Idx": 248
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::ones_like", "pid": 3416838, "tid": 3416838,
"ts": 6079338130676.763, "dur": 17.656,
"args": {
"External id": 165,"Record function id": 0, "Ev Idx": 249
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 3416838, "tid": 3416838,
"ts": 6079338130677.424, "dur": 5.248,
"args": {
"External id": 166,"Record function id": 0, "Ev Idx": 250
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 3416838, "tid": 3416838,
"ts": 6079338130678.215, "dur": 3.996,
"args": {
"External id": 167,"Record function id": 0, "Ev Idx": 251
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 3416838, "tid": 3416838,
"ts": 6079338130683.262, "dur": 10.937,
"args": {
"External id": 168,"Record function id": 0, "Ev Idx": 252
}
},
{
"ph": "X", "cat": "user_annotation", "name": "Step 4", "pid": 3416838, "tid": 3416838,
"ts": 6079338131009.635, "dur": 837.983,
"args": {
"External id": 169,"Record function id": 0, "Ev Idx": 253
}
},
{
"ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 3416838, "tid": 3416838,
"ts": 6079338131029.004, "dur": 13.250,
"args": {
"External id": 170,"Record function id": 0, "Ev Idx": 254
}
},
{
"ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 0/0", "pid": 3416838, "tid": 3416838,
"ts": 6079338131042.895, "dur": 771.243,
"args": {
"External id": 171,"Record function id": 0, "Ev Idx": 255
}
},
{
"ph": "X", "cat": "cpu_op", "name": "Pregraph bytecode", "pid": 3416838, "tid": 3416838,
"ts": 6079338131046.821, "dur": 1.662,
"args": {
"External id": 172,"Record function id": 0, "Ev Idx": 256
}
},
{
"ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 3416838, "tid": 3416838,
"ts": 6079338131075.784, "dur": 430.299,
"args": {
"External id": 173,"Record function id": 0, "Sequence number": 131, "Fwd thread id": 0, "Ev Idx": 257
}
},
{
"ph": "s", "id": 5, "pid": 3416838, "tid": 3416838, "ts": 6079338131075.784,
"cat": "fwdbwd", "name": "fwdbwd"
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused_nll_loss_backward_nll_loss_forward_0", "pid": 3416838, "tid": 3416838,
"ts": 6079338131129.465, "dur": 23.686,
"args": {
"External id": 174,"Record function id": 0, "Ev Idx": 258
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_per_fused_nll_loss_forward_1", "pid": 3416838, "tid": 3416838,
"ts": 6079338131161.243, "dur": 8.763,
"args": {
"External id": 175,"Record function id": 0, "Ev Idx": 259
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_mul_2", "pid": 3416838, "tid": 3416838,
"ts": 6079338131178.359, "dur": 8.182,
"args": {
"External id": 176,"Record function id": 0, "Ev Idx": 260
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 3416838, "tid": 3416838,
"ts": 6079338131194.253, "dur": 11.688,
"args": {
"External id": 177,"Record function id": 0, "Ev Idx": 261
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 3416838, "tid": 3416838,
"ts": 6079338131195.905, "dur": 8.924,
"args": {
"External id": 178,"Record function id": 0, "Ev Idx": 262
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 3416838, "tid": 3416838,
"ts": 6079338131198.029, "dur": 5.308,
"args": {
"External id": 179,"Record function id": 0, "Ev Idx": 263
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 3416838, "tid": 3416838,
"ts": 6079338131199.781, "dur": 3.235,
"args": {
"External id": 180,"Record function id": 0, "Ev Idx": 264
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 3416838, "tid": 3416838,
"ts": 6079338131200.843, "dur": 1.492,
"args": {
"External id": 181,"Record function id": 0, "Ev Idx": 265
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 3416838, "tid": 3416838,
"ts": 6079338131212.250, "dur": 3.495,
"args": {
"External id": 182,"Record function id": 0, "Ev Idx": 266
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 3416838, "tid": 3416838,
"ts": 6079338131212.691, "dur": 2.764,
"args": {
"External id": 183,"Record function id": 0, "Ev Idx": 267
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 3416838, "tid": 3416838,
"ts": 6079338131213.161, "dur": 1.773,
"args": {
"External id": 184,"Record function id": 0, "Ev Idx": 268
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 3416838, "tid": 3416838,
"ts": 6079338131213.582, "dur": 1.152,
"args": {
"External id": 185,"Record function id": 0, "Ev Idx": 269
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 3416838, "tid": 3416838,
"ts": 6079338131213.943, "dur": 0.520,
"args": {
"External id": 186,"Record function id": 0, "Ev Idx": 270
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 3416838, "tid": 3416838,
"ts": 6079338131219.020, "dur": 2.795,
"args": {
"External id": 187,"Record function id": 0, "Ev Idx": 271
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 3416838, "tid": 3416838,
"ts": 6079338131219.321, "dur": 2.273,
"args": {
"External id": 188,"Record function id": 0, "Ev Idx": 272
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 3416838, "tid": 3416838,
"ts": 6079338131219.761, "dur": 1.403,
"args": {
"External id": 189,"Record function id": 0, "Ev Idx": 273
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 3416838, "tid": 3416838,
"ts": 6079338131220.122, "dur": 0.881,
"args": {
"External id": 190,"Record function id": 0, "Ev Idx": 274
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 3416838, "tid": 3416838,
"ts": 6079338131220.422, "dur": 0.381,
"args": {
"External id": 191,"Record function id": 0, "Ev Idx": 275
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::chunk", "pid": 3416838, "tid": 3416838,
"ts": 6079338131225.260, "dur": 2.804,
"args": {
"External id": 192,"Record function id": 0, "Ev Idx": 276
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::split", "pid": 3416838, "tid": 3416838,
"ts": 6079338131225.550, "dur": 2.274,
"args": {
"External id": 193,"Record function id": 0, "Ev Idx": 277
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 3416838, "tid": 3416838,
"ts": 6079338131225.911, "dur": 1.442,
"args": {
"External id": 194,"Record function id": 0, "Ev Idx": 278
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 3416838, "tid": 3416838,
"ts": 6079338131226.291, "dur": 0.912,
"args": {
"External id": 195,"Record function id": 0, "Ev Idx": 279
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 3416838, "tid": 3416838,
"ts": 6079338131226.602, "dur": 0.440,
"args": {
"External id": 196,"Record function id": 0, "Ev Idx": 280
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_3", "pid": 3416838, "tid": 3416838,
"ts": 6079338131239.271, "dur": 8.923,
"args": {
"External id": 197,"Record function id": 0, "Ev Idx": 281
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_4", "pid": 3416838, "tid": 3416838,
"ts": 6079338131255.165, "dur": 7.040,
"args": {
"External id": 198,"Record function id": 0, "Ev Idx": 282
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_5", "pid": 3416838, "tid": 3416838,
"ts": 6079338131268.615, "dur": 5.488,
"args": {
"External id": 199,"Record function id": 0, "Ev Idx": 283
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 3416838, "tid": 3416838,
"ts": 6079338131293.493, "dur": 34.221,
"args": {
"External id": 200,"Record function id": 0, "Ev Idx": 284
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused__log_softmax__log_softmax_backward_data_addmm_nll_loss_backward_nll_loss_forward_6", "pid": 3416838, "tid": 3416838,
"ts": 6079338131341.525, "dur": 13.741,
"args": {
"External id": 201,"Record function id": 0, "Ev Idx": 285
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 3416838, "tid": 3416838,
"ts": 6079338131363.308, "dur": 12.368,
"args": {
"External id": 202,"Record function id": 0, "Ev Idx": 286
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused_nll_loss_forward_sum_7", "pid": 3416838, "tid": 3416838,
"ts": 6079338131386.533, "dur": 8.272,
"args": {
"External id": 203,"Record function id": 0, "Ev Idx": 287
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::addmm", "pid": 3416838, "tid": 3416838,
"ts": 6079338131401.525, "dur": 31.558,
"args": {
"External id": 204,"Record function id": 0, "Ev Idx": 288
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_red_fused_nll_loss_forward_8", "pid": 3416838, "tid": 3416838,
"ts": 6079338131450.229, "dur": 9.705,
"args": {
"External id": 205,"Record function id": 0, "Ev Idx": 289
}
},
{
"ph": "X", "cat": "cpu_op", "name": "triton_per_fused_nll_loss_forward_9", "pid": 3416838, "tid": 3416838,
"ts": 6079338131466.874, "dur": 8.303,
"args": {
"External id": 206,"Record function id": 0, "Ev Idx": 290
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::ones_like", "pid": 3416838, "tid": 3416838,
"ts": 6079338131530.530, "dur": 16.765,
"args": {
"External id": 207,"Record function id": 0, "Ev Idx": 291
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 3416838, "tid": 3416838,
"ts": 6079338131531.201, "dur": 5.308,
"args": {
"External id": 208,"Record function id": 0, "Ev Idx": 292
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 3416838, "tid": 3416838,
"ts": 6079338131532.052, "dur": 4.046,
"args": {
"External id": 209,"Record function id": 0, "Ev Idx": 293
}
},
{
"ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 3416838, "tid": 3416838,
"ts": 6079338131537.160, "dur": 9.905,
"args": {
"External id": 210,"Record function id": 0, "Ev Idx": 294
}
},
{
"ph": "X", "cat": "overhead", "name": "Unrecognized", "pid": -1, "tid": 0,
"ts": 6079338125539.867, "dur": 856.272
},
{
"ph": "X", "cat": "kernel", "name": "triton_red_fused_nll_loss_backward_nll_loss_forward_0", "pid": 0, "tid": 7,
"ts": 6079338126554.437, "dur": 6.880,
"args": {
"External id": 6, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 30, "registers per thread": 32, "shared memory": 16384, "blocks per SM": 0.030303, "warps per SM": 0.484848, "grid": [4, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 1
}
},
{
"ph": "f", "id": 30, "pid": 0, "tid": 7, "ts": 6079338126554.437,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 3416838, "tid": 3416838,
"ts": 6079338126521.117, "dur": 34.001,
"args": {
"External id": 6, "cbid": 307, "correlation": 30
}
},
{
"ph": "s", "id": 30, "pid": 3416838, "tid": 3416838, "ts": 6079338126521.117,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_per_fused_nll_loss_forward_1", "pid": 0, "tid": 7,
"ts": 6079338126589.477, "dur": 1.664,
"args": {
"External id": 7, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 38, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.015152, "grid": [1, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0
}
},
{
"ph": "f", "id": 38, "pid": 0, "tid": 7, "ts": 6079338126589.477,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 3416838, "tid": 3416838,
"ts": 6079338126582.039, "dur": 6.039,
"args": {
"External id": 7, "cbid": 307, "correlation": 38
}
},
{
"ph": "s", "id": 38, "pid": 3416838, "tid": 3416838, "ts": 6079338126582.039,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_mul_2", "pid": 0, "tid": 7,
"ts": 6079338126623.621, "dur": 47.296,
"args": {
"External id": 8, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 46, "registers per thread": 16, "shared memory": 0, "blocks per SM": 186.181824, "warps per SM": 744.727295, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100
}
},
{
"ph": "f", "id": 46, "pid": 0, "tid": 7, "ts": 6079338126623.621,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 3416838, "tid": 3416838,
"ts": 6079338126615.990, "dur": 5.608,
"args": {
"External id": 8, "cbid": 307, "correlation": 46
}
},
{
"ph": "s", "id": 46, "pid": 3416838, "tid": 3416838, "ts": 6079338126615.990,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_3", "pid": 0, "tid": 7,
"ts": 6079338126762.565, "dur": 1.312,
"args": {
"External id": 29, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 53, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.007576, "grid": [1, 1, 1], "block": [32, 1, 1], "est. achieved occupancy %": 0
}
},
{
"ph": "f", "id": 53, "pid": 0, "tid": 7, "ts": 6079338126762.565,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 3416838, "tid": 3416838,
"ts": 6079338126754.429, "dur": 6.369,
"args": {
"External id": 29, "cbid": 307, "correlation": 53
}
},
{
"ph": "s", "id": 53, "pid": 3416838, "tid": 3416838, "ts": 6079338126754.429,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_4", "pid": 0, "tid": 7,
"ts": 6079338126786.885, "dur": 31.616,
"args": {
"External id": 30, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 60, "registers per thread": 16, "shared memory": 0, "blocks per SM": 285.553040, "warps per SM": 1142.212158, "grid": [37693, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100
}
},
{
"ph": "f", "id": 60, "pid": 0, "tid": 7, "ts": 6079338126786.885,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 3416838, "tid": 3416838,
"ts": 6079338126780.087, "dur": 4.758,
"args": {
"External id": 30, "cbid": 307, "correlation": 60
}
},
{
"ph": "s", "id": 60, "pid": 3416838, "tid": 3416838, "ts": 6079338126780.087,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_5", "pid": 0, "tid": 7,
"ts": 6079338126819.365, "dur": 1.440,
"args": {
"External id": 31, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 67, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.750000, "warps per SM": 3.000000, "grid": [99, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 5
}
},
{
"ph": "f", "id": 67, "pid": 0, "tid": 7, "ts": 6079338126819.365,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 3416838, "tid": 3416838,
"ts": 6079338126800.929, "dur": 5.068,
"args": {
"External id": 31, "cbid": 307, "correlation": 67
}
},
{
"ph": "s", "id": 67, "pid": 3416838, "tid": 3416838, "ts": 6079338126800.929,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 3416838, "tid": 3416838,
"ts": 6079338126918.166, "dur": 2.323,
"args": {
"External id": 32, "cbid": 200, "correlation": 82
}
},
{
"ph": "f", "id": 82, "pid": 3416838, "tid": 3416838, "ts": 6079338126918.166,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2<cutlass_75_tensorop_bf16_s1688gemm_bf16_128x128_tn_align1>(cutlass_75_tensorop_bf16_s1688gemm_bf16_128x128_tn_align1::Params)", "pid": 0, "tid": 7,
"ts": 6079338126932.070, "dur": 26818.492,
"args": {
"External id": 32, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 84, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 775.757568, "warps per SM": 3103.030273, "grid": [2048, 50, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13
}
},
{
"ph": "f", "id": 84, "pid": 0, "tid": 7, "ts": 6079338126932.070,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 3416838, "tid": 3416838,
"ts": 6079338126923.183, "dur": 6.610,
"args": {
"External id": 32, "cbid": 307, "correlation": 84
}
},
{
"ph": "s", "id": 84, "pid": 3416838, "tid": 3416838, "ts": 6079338126923.183,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_red_fused__log_softmax__log_softmax_backward_data_addmm_nll_loss_backward_nll_loss_forward_6", "pid": 0, "tid": 7,
"ts": 6079338153751.458, "dur": 5564.774,
"args": {
"External id": 33, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 105, "registers per thread": 48, "shared memory": 32, "blocks per SM": 248.242432, "warps per SM": 1985.939453, "grid": [32768, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 63
}
},
{
"ph": "f", "id": 105, "pid": 0, "tid": 7, "ts": 6079338153751.458,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 3416838, "tid": 3416838,
"ts": 6079338126967.550, "dur": 6.219,
"args": {
"External id": 33, "cbid": 307, "correlation": 105
}
},
{
"ph": "s", "id": 105, "pid": 3416838, "tid": 3416838, "ts": 6079338126967.550,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 3416838, "tid": 3416838,
"ts": 6079338126994.941, "dur": 0.341,
"args": {
"External id": 34, "cbid": 200, "correlation": 118
}
},
{
"ph": "f", "id": 118, "pid": 3416838, "tid": 3416838, "ts": 6079338126994.941,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2<cutlass_75_tensorop_bf16_s1688gemm_bf16_256x128_nn_align1>(cutlass_75_tensorop_bf16_s1688gemm_bf16_256x128_nn_align1::Params)", "pid": 0, "tid": 7,
"ts": 6079338159317.000, "dur": 19589.173,
"args": {
"External id": 34, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 121, "registers per thread": 229, "shared memory": 49152, "blocks per SM": 7.757576, "warps per SM": 62.060608, "grid": [1024, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13
}
},
{
"ph": "f", "id": 121, "pid": 0, "tid": 7, "ts": 6079338159317.000,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 3416838, "tid": 3416838,
"ts": 6079338127011.386, "dur": 5.318,
"args": {
"External id": 34, "cbid": 307, "correlation": 121
}
},
{
"ph": "s", "id": 121, "pid": 3416838, "tid": 3416838, "ts": 6079338127011.386,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_red_fused_nll_loss_forward_sum_7", "pid": 0, "tid": 7,
"ts": 6079338178907.133, "dur": 1656.673,
"args": {
"External id": 35, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 132, "registers per thread": 40, "shared memory": 4096, "blocks per SM": 5.954545, "warps per SM": 95.272728, "grid": [786, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 75
}
},
{
"ph": "f", "id": 132, "pid": 0, "tid": 7, "ts": 6079338178907.133,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 3416838, "tid": 3416838,
"ts": 6079338127042.804, "dur": 5.478,
"args": {
"External id": 35, "cbid": 307, "correlation": 132
}
},
{
"ph": "s", "id": 132, "pid": 3416838, "tid": 3416838, "ts": 6079338127042.804,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7,
"ts": 6079338180564.799, "dur": 75.327,
"args": {
"External id": 36, "device": 0, "context": 1, "stream": 7, "correlation": 139, "bytes": 77194752, "memory bandwidth (GB/s)": 1024.7952526982358
}
},
{
"ph": "f", "id": 139, "pid": 0, "tid": 7, "ts": 6079338180564.799,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 3416838, "tid": 3416838,
"ts": 6079338127089.734, "dur": 26.350,
"args": {
"External id": 36, "cbid": 41, "correlation": 139
}
},
{
"ph": "s", "id": 139, "pid": 3416838, "tid": 3416838, "ts": 6079338127089.734,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 3416838, "tid": 3416838,
"ts": 6079338127137.456, "dur": 0.651,
"args": {
"External id": 36, "cbid": 200, "correlation": 150
}
},
{
"ph": "f", "id": 150, "pid": 3416838, "tid": 3416838, "ts": 6079338127137.456,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2<cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_256x128_32x6_nt_align8>(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_256x128_32x6_nt_align8::Params)", "pid": 0, "tid": 7,
"ts": 6079338180641.054, "dur": 5841.382,
"args": {
"External id": 36, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 153, "registers per thread": 216, "shared memory": 147456, "blocks per SM": 11.909091, "warps per SM": 95.272728, "grid": [1572, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0
}
},
{
"ph": "f", "id": 153, "pid": 0, "tid": 7, "ts": 6079338180641.054,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 3416838, "tid": 3416838,
"ts": 6079338127140.190, "dur": 4.798,
"args": {
"External id": 36, "cbid": 307, "correlation": 153
}
},
{
"ph": "s", "id": 153, "pid": 3416838, "tid": 3416838, "ts": 6079338127140.190,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_red_fused_nll_loss_forward_8", "pid": 0, "tid": 7,
"ts": 6079338186483.365, "dur": 2.656,
"args": {
"External id": 37, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 165, "registers per thread": 26, "shared memory": 64, "blocks per SM": 0.030303, "warps per SM": 0.484848, "grid": [4, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 1
}
},
{
"ph": "f", "id": 165, "pid": 0, "tid": 7, "ts": 6079338186483.365,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 3416838, "tid": 3416838,
"ts": 6079338127179.940, "dur": 4.968,
"args": {
"External id": 37, "cbid": 307, "correlation": 165
}
},
{
"ph": "s", "id": 165, "pid": 3416838, "tid": 3416838, "ts": 6079338127179.940,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_per_fused_nll_loss_forward_9", "pid": 0, "tid": 7,
"ts": 6079338186487.941, "dur": 1.791,
"args": {
"External id": 38, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 170, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.015152, "grid": [1, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0
}
},
{
"ph": "f", "id": 170, "pid": 0, "tid": 7, "ts": 6079338186487.941,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 3416838, "tid": 3416838,
"ts": 6079338127203.426, "dur": 4.326,
"args": {
"External id": 38, "cbid": 307, "correlation": 170
}
},
{
"ph": "s", "id": 170, "pid": 3416838, "tid": 3416838, "ts": 6079338127203.426,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<8, at::native::FillFunctor<c10::BFloat16>, std::array<char*, 1ul> >(int, at::native::FillFunctor<c10::BFloat16>, std::array<char*, 1ul>)", "pid": 0, "tid": 7,
"ts": 6079338186490.533, "dur": 1.568,
"args": {
"External id": 42, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 181, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.030303, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0
}
},
{
"ph": "f", "id": 181, "pid": 0, "tid": 7, "ts": 6079338186490.533,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 3416838, "tid": 3416838,
"ts": 6079338127393.392, "dur": 11.057,
"args": {
"External id": 42, "cbid": 211, "correlation": 181
}
},
{
"ph": "s", "id": 181, "pid": 3416838, "tid": 3416838, "ts": 6079338127393.392,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 3416838, "tid": 3416838,
"ts": 6079338127453.863, "dur": 2.815,
"args": {
"External id": 3, "cbid": 135, "correlation": 189
}
},
{
"ph": "f", "id": 189, "pid": 3416838, "tid": 3416838, "ts": 6079338127453.863,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_0", "pid": 0, "tid": 7,
"ts": 6079338186493.029, "dur": 60.960,
"args": {
"External id": 515, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 198, "registers per thread": 16, "shared memory": 0, "blocks per SM": 285.553040, "warps per SM": 1142.212158, "grid": [37693, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100
}
},
{
"ph": "f", "id": 198, "pid": 0, "tid": 7, "ts": 6079338186493.029,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 3416838, "tid": 3420252,
"ts": 6079338127984.734, "dur": 44.797,
"args": {
"External id": 515, "cbid": 307, "correlation": 198
}
},
{
"ph": "s", "id": 198, "pid": 3416838, "tid": 3420252, "ts": 6079338127984.734,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_1", "pid": 0, "tid": 7,
"ts": 6079338186554.885, "dur": 2.400,
"args": {
"External id": 516, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 202, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.492424, "warps per SM": 5.969697, "grid": [197, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9
}
},
{
"ph": "f", "id": 202, "pid": 0, "tid": 7, "ts": 6079338186554.885,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 3416838, "tid": 3420252,
"ts": 6079338128054.649, "dur": 7.161,
"args": {
"External id": 516, "cbid": 307, "correlation": 202
}
},
{
"ph": "s", "id": 202, "pid": 3416838, "tid": 3420252, "ts": 6079338128054.649,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_mul_2", "pid": 0, "tid": 7,
"ts": 6079338186559.333, "dur": 47.072,
"args": {
"External id": 517, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 206, "registers per thread": 22, "shared memory": 0, "blocks per SM": 186.181824, "warps per SM": 744.727295, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100
}
},
{
"ph": "f", "id": 206, "pid": 0, "tid": 7, "ts": 6079338186559.333,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 3416838, "tid": 3420252,
"ts": 6079338128097.013, "dur": 5.769,
"args": {
"External id": 517, "cbid": 307, "correlation": 206
}
},
{
"ph": "s", "id": 206, "pid": 3416838, "tid": 3420252, "ts": 6079338128097.013,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 3416838, "tid": 3420252,
"ts": 6079338128152.266, "dur": 2.775,
"args": {
"External id": 513, "cbid": 135, "correlation": 211
}
},
{
"ph": "f", "id": 211, "pid": 3416838, "tid": 3420252, "ts": 6079338128152.266,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 3416838, "tid": 3420252,
"ts": 6079338128159.778, "dur": 0.460,
"args": {
"External id": 513, "cbid": 135, "correlation": 216
}
},
{
"ph": "f", "id": 216, "pid": 3416838, "tid": 3420252, "ts": 6079338128159.778,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 3416838, "tid": 3420252,
"ts": 6079338128164.124, "dur": 0.561,
"args": {
"External id": 513, "cbid": 135, "correlation": 221
}
},
{
"ph": "f", "id": 221, "pid": 3416838, "tid": 3420252, "ts": 6079338128164.124,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "kernel", "name": "triton_red_fused_nll_loss_backward_nll_loss_forward_0", "pid": 0, "tid": 7,
"ts": 6079338186608.485, "dur": 7.936,
"args": {
"External id": 48, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 282, "registers per thread": 32, "shared memory": 16384, "blocks per SM": 0.030303, "warps per SM": 0.484848, "grid": [4, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 1
}
},
{
"ph": "f", "id": 282, "pid": 0, "tid": 7, "ts": 6079338186608.485,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 3416838, "tid": 3416838,
"ts": 6079338128531.067, "dur": 11.658,
"args": {
"External id": 48, "cbid": 307, "correlation": 282
}
},
{
"ph": "s", "id": 282, "pid": 3416838, "tid": 3416838, "ts": 6079338128531.067,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_per_fused_nll_loss_forward_1", "pid": 0, "tid": 7,
"ts": 6079338186617.285, "dur": 1.600,
"args": {
"External id": 49, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.015152, "grid": [1, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0
}
},
{
"ph": "f", "id": 290, "pid": 0, "tid": 7, "ts": 6079338186617.285,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 3416838, "tid": 3416838,
"ts": 6079338128559.150, "dur": 4.186,
"args": {
"External id": 49, "cbid": 307, "correlation": 290
}
},
{
"ph": "s", "id": 290, "pid": 3416838, "tid": 3416838, "ts": 6079338128559.150,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_mul_2", "pid": 0, "tid": 7,
"ts": 6079338186619.685, "dur": 48.192,
"args": {
"External id": 50, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 298, "registers per thread": 16, "shared memory": 0, "blocks per SM": 186.181824, "warps per SM": 744.727295, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100
}
},
{
"ph": "f", "id": 298, "pid": 0, "tid": 7, "ts": 6079338186619.685,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 3416838, "tid": 3416838,
"ts": 6079338128577.097, "dur": 3.235,
"args": {
"External id": 50, "cbid": 307, "correlation": 298
}
},
{
"ph": "s", "id": 298, "pid": 3416838, "tid": 3416838, "ts": 6079338128577.097,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_3", "pid": 0, "tid": 7,
"ts": 6079338186669.797, "dur": 1.696,
"args": {
"External id": 71, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 305, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.007576, "grid": [1, 1, 1], "block": [32, 1, 1], "est. achieved occupancy %": 0
}
},
{
"ph": "f", "id": 305, "pid": 0, "tid": 7, "ts": 6079338186669.797,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 3416838, "tid": 3416838,
"ts": 6079338128639.040, "dur": 4.848,
"args": {
"External id": 71, "cbid": 307, "correlation": 305
}
},
{
"ph": "s", "id": 305, "pid": 3416838, "tid": 3416838, "ts": 6079338128639.040,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_4", "pid": 0, "tid": 7,
"ts": 6079338186672.421, "dur": 31.488,
"args": {
"External id": 72, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 312, "registers per thread": 16, "shared memory": 0, "blocks per SM": 285.553040, "warps per SM": 1142.212158, "grid": [37693, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100
}
},
{
"ph": "f", "id": 312, "pid": 0, "tid": 7, "ts": 6079338186672.421,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 3416838, "tid": 3416838,
"ts": 6079338128655.275, "dur": 2.744,
"args": {
"External id": 72, "cbid": 307, "correlation": 312
}
},
{
"ph": "s", "id": 312, "pid": 3416838, "tid": 3416838, "ts": 6079338128655.275,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_5", "pid": 0, "tid": 7,
"ts": 6079338186704.773, "dur": 1.536,
"args": {
"External id": 73, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 319, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.750000, "warps per SM": 3.000000, "grid": [99, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 5
}
},
{
"ph": "f", "id": 319, "pid": 0, "tid": 7, "ts": 6079338186704.773,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 3416838, "tid": 3416838,
"ts": 6079338128668.915, "dur": 2.554,
"args": {
"External id": 73, "cbid": 307, "correlation": 319
}
},
{
"ph": "s", "id": 319, "pid": 3416838, "tid": 3416838, "ts": 6079338128668.915,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 3416838, "tid": 3416838,
"ts": 6079338128707.804, "dur": 0.691,
"args": {
"External id": 74, "cbid": 200, "correlation": 334
}
},
{
"ph": "f", "id": 334, "pid": 3416838, "tid": 3416838, "ts": 6079338128707.804,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2<cutlass_75_tensorop_bf16_s1688gemm_bf16_128x128_tn_align1>(cutlass_75_tensorop_bf16_s1688gemm_bf16_128x128_tn_align1::Params)", "pid": 0, "tid": 7,
"ts": 6079338186707.141, "dur": 26820.252,
"args": {
"External id": 74, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 336, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 775.757568, "warps per SM": 3103.030273, "grid": [2048, 50, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13
}
},
{
"ph": "f", "id": 336, "pid": 0, "tid": 7, "ts": 6079338186707.141,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 3416838, "tid": 3416838,
"ts": 6079338128709.306, "dur": 3.916,
"args": {
"External id": 74, "cbid": 307, "correlation": 336
}
},
{
"ph": "s", "id": 336, "pid": 3416838, "tid": 3416838, "ts": 6079338128709.306,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_red_fused__log_softmax__log_softmax_backward_data_addmm_nll_loss_backward_nll_loss_forward_6", "pid": 0, "tid": 7,
"ts": 6079338213529.473, "dur": 5565.030,
"args": {
"External id": 75, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 357, "registers per thread": 48, "shared memory": 32, "blocks per SM": 248.242432, "warps per SM": 1985.939453, "grid": [32768, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 63
}
},
{
"ph": "f", "id": 357, "pid": 0, "tid": 7, "ts": 6079338213529.473,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 3416838, "tid": 3416838,
"ts": 6079338128739.131, "dur": 3.616,
"args": {
"External id": 75, "cbid": 307, "correlation": 357
}
},
{
"ph": "s", "id": 357, "pid": 3416838, "tid": 3416838, "ts": 6079338128739.131,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 3416838, "tid": 3416838,
"ts": 6079338128759.021, "dur": 0.351,
"args": {
"External id": 76, "cbid": 200, "correlation": 370
}
},
{
"ph": "f", "id": 370, "pid": 3416838, "tid": 3416838, "ts": 6079338128759.021,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2<cutlass_75_tensorop_bf16_s1688gemm_bf16_256x128_nn_align1>(cutlass_75_tensorop_bf16_s1688gemm_bf16_256x128_nn_align1::Params)", "pid": 0, "tid": 7,
"ts": 6079338219096.583, "dur": 19947.829,
"args": {
"External id": 76, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 373, "registers per thread": 229, "shared memory": 49152, "blocks per SM": 7.757576, "warps per SM": 62.060608, "grid": [1024, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13
}
},
{
"ph": "f", "id": 373, "pid": 0, "tid": 7, "ts": 6079338219096.583,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 3416838, "tid": 3416838,
"ts": 6079338128760.493, "dur": 3.496,
"args": {
"External id": 76, "cbid": 307, "correlation": 373
}
},
{
"ph": "s", "id": 373, "pid": 3416838, "tid": 3416838, "ts": 6079338128760.493,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_red_fused_nll_loss_forward_sum_7", "pid": 0, "tid": 7,
"ts": 6079338239045.277, "dur": 1659.425,
"args": {
"External id": 77, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 384, "registers per thread": 40, "shared memory": 4096, "blocks per SM": 5.954545, "warps per SM": 95.272728, "grid": [786, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 75
}
},
{
"ph": "f", "id": 384, "pid": 0, "tid": 7, "ts": 6079338239045.277,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 3416838, "tid": 3416838,
"ts": 6079338128785.491, "dur": 2.925,
"args": {
"External id": 77, "cbid": 307, "correlation": 384
}
},
{
"ph": "s", "id": 384, "pid": 3416838, "tid": 3416838, "ts": 6079338128785.491,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7,
"ts": 6079338240705.566, "dur": 76.704,
"args": {
"External id": 78, "device": 0, "context": 1, "stream": 7, "correlation": 391, "bytes": 77194752, "memory bandwidth (GB/s)": 1006.3979974968711
}
},
{
"ph": "f", "id": 391, "pid": 0, "tid": 7, "ts": 6079338240705.566,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 3416838, "tid": 3416838,
"ts": 6079338128804.299, "dur": 10.606,
"args": {
"External id": 78, "cbid": 41, "correlation": 391
}
},
{
"ph": "s", "id": 391, "pid": 3416838, "tid": 3416838, "ts": 6079338128804.299,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 3416838, "tid": 3416838,
"ts": 6079338128824.350, "dur": 0.240,
"args": {
"External id": 78, "cbid": 200, "correlation": 402
}
},
{
"ph": "f", "id": 402, "pid": 3416838, "tid": 3416838, "ts": 6079338128824.350,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2<cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_256x128_32x6_nt_align8>(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_256x128_32x6_nt_align8::Params)", "pid": 0, "tid": 7,
"ts": 6079338240783.102, "dur": 5841.479,
"args": {
"External id": 78, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 405, "registers per thread": 216, "shared memory": 147456, "blocks per SM": 11.909091, "warps per SM": 95.272728, "grid": [1572, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0
}
},
{
"ph": "f", "id": 405, "pid": 0, "tid": 7, "ts": 6079338240783.102,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 3416838, "tid": 3416838,
"ts": 6079338128825.642, "dur": 2.924,
"args": {
"External id": 78, "cbid": 307, "correlation": 405
}
},
{
"ph": "s", "id": 405, "pid": 3416838, "tid": 3416838, "ts": 6079338128825.642,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_red_fused_nll_loss_forward_8", "pid": 0, "tid": 7,
"ts": 6079338246625.445, "dur": 2.464,
"args": {
"External id": 79, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 417, "registers per thread": 26, "shared memory": 64, "blocks per SM": 0.030303, "warps per SM": 0.484848, "grid": [4, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 1
}
},
{
"ph": "f", "id": 417, "pid": 0, "tid": 7, "ts": 6079338246625.445,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 3416838, "tid": 3416838,
"ts": 6079338128854.475, "dur": 4.216,
"args": {
"External id": 79, "cbid": 307, "correlation": 417
}
},
{
"ph": "s", "id": 417, "pid": 3416838, "tid": 3416838, "ts": 6079338128854.475,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_per_fused_nll_loss_forward_9", "pid": 0, "tid": 7,
"ts": 6079338246628.773, "dur": 1.888,
"args": {
"External id": 80, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 422, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.015152, "grid": [1, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0
}
},
{
"ph": "f", "id": 422, "pid": 0, "tid": 7, "ts": 6079338246628.773,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 3416838, "tid": 3416838,
"ts": 6079338128872.202, "dur": 2.784,
"args": {
"External id": 80, "cbid": 307, "correlation": 422
}
},
{
"ph": "s", "id": 422, "pid": 3416838, "tid": 3416838, "ts": 6079338128872.202,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<8, at::native::FillFunctor<c10::BFloat16>, std::array<char*, 1ul> >(int, at::native::FillFunctor<c10::BFloat16>, std::array<char*, 1ul>)", "pid": 0, "tid": 7,
"ts": 6079338246632.709, "dur": 1.536,
"args": {
"External id": 84, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 433, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.030303, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0
}
},
{
"ph": "f", "id": 433, "pid": 0, "tid": 7, "ts": 6079338246632.709,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 3416838, "tid": 3416838,
"ts": 6079338128949.619, "dur": 6.039,
"args": {
"External id": 84, "cbid": 211, "correlation": 433
}
},
{
"ph": "s", "id": 433, "pid": 3416838, "tid": 3416838, "ts": 6079338128949.619,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 3416838, "tid": 3416838,
"ts": 6079338128974.917, "dur": 1.081,
"args": {
"External id": 45, "cbid": 135, "correlation": 441
}
},
{
"ph": "f", "id": 441, "pid": 3416838, "tid": 3416838, "ts": 6079338128974.917,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_0", "pid": 0, "tid": 7,
"ts": 6079338246635.173, "dur": 60.864,
"args": {
"External id": 532, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 450, "registers per thread": 16, "shared memory": 0, "blocks per SM": 285.553040, "warps per SM": 1142.212158, "grid": [37693, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100
}
},
{
"ph": "f", "id": 450, "pid": 0, "tid": 7, "ts": 6079338246635.173,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 3416838, "tid": 3420252,
"ts": 6079338129109.790, "dur": 11.868,
"args": {
"External id": 532, "cbid": 307, "correlation": 450
}
},
{
"ph": "s", "id": 450, "pid": 3416838, "tid": 3420252, "ts": 6079338129109.790,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_1", "pid": 0, "tid": 7,
"ts": 6079338246696.965, "dur": 2.304,
"args": {
"External id": 533, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 454, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.492424, "warps per SM": 5.969697, "grid": [197, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9
}
},
{
"ph": "f", "id": 454, "pid": 0, "tid": 7, "ts": 6079338246696.965,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 3416838, "tid": 3420252,
"ts": 6079338129133.646, "dur": 4.297,
"args": {
"External id": 533, "cbid": 307, "correlation": 454
}
},
{
"ph": "s", "id": 454, "pid": 3416838, "tid": 3420252, "ts": 6079338129133.646,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_mul_2", "pid": 0, "tid": 7,
"ts": 6079338246700.165, "dur": 48.000,
"args": {
"External id": 534, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 458, "registers per thread": 22, "shared memory": 0, "blocks per SM": 186.181824, "warps per SM": 744.727295, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100
}
},
{
"ph": "f", "id": 458, "pid": 0, "tid": 7, "ts": 6079338246700.165,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 3416838, "tid": 3420252,
"ts": 6079338129152.454, "dur": 3.756,
"args": {
"External id": 534, "cbid": 307, "correlation": 458
}
},
{
"ph": "s", "id": 458, "pid": 3416838, "tid": 3420252, "ts": 6079338129152.454,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 3416838, "tid": 3420252,
"ts": 6079338129176.901, "dur": 0.892,
"args": {
"External id": 530, "cbid": 135, "correlation": 463
}
},
{
"ph": "f", "id": 463, "pid": 3416838, "tid": 3420252, "ts": 6079338129176.901,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 3416838, "tid": 3420252,
"ts": 6079338129180.246, "dur": 0.561,
"args": {
"External id": 530, "cbid": 135, "correlation": 468
}
},
{
"ph": "f", "id": 468, "pid": 3416838, "tid": 3420252, "ts": 6079338129180.246,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 3416838, "tid": 3420252,
"ts": 6079338129183.041, "dur": 0.530,
"args": {
"External id": 530, "cbid": 135, "correlation": 473
}
},
{
"ph": "f", "id": 473, "pid": 3416838, "tid": 3420252, "ts": 6079338129183.041,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "kernel", "name": "triton_red_fused_nll_loss_backward_nll_loss_forward_0", "pid": 0, "tid": 7,
"ts": 6079338246750.149, "dur": 7.744,
"args": {
"External id": 90, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 534, "registers per thread": 32, "shared memory": 16384, "blocks per SM": 0.030303, "warps per SM": 0.484848, "grid": [4, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 1
}
},
{
"ph": "f", "id": 534, "pid": 0, "tid": 7, "ts": 6079338246750.149,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 3416838, "tid": 3416838,
"ts": 6079338129435.671, "dur": 10.967,
"args": {
"External id": 90, "cbid": 307, "correlation": 534
}
},
{
"ph": "s", "id": 534, "pid": 3416838, "tid": 3416838, "ts": 6079338129435.671,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_per_fused_nll_loss_forward_1", "pid": 0, "tid": 7,
"ts": 6079338246758.757, "dur": 1.440,
"args": {
"External id": 91, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 542, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.015152, "grid": [1, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0
}
},
{
"ph": "f", "id": 542, "pid": 0, "tid": 7, "ts": 6079338246758.757,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 3416838, "tid": 3416838,
"ts": 6079338129461.611, "dur": 3.284,
"args": {
"External id": 91, "cbid": 307, "correlation": 542
}
},
{
"ph": "s", "id": 542, "pid": 3416838, "tid": 3416838, "ts": 6079338129461.611,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_mul_2", "pid": 0, "tid": 7,
"ts": 6079338246761.061, "dur": 48.256,
"args": {
"External id": 92, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 550, "registers per thread": 16, "shared memory": 0, "blocks per SM": 186.181824, "warps per SM": 744.727295, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100
}
},
{
"ph": "f", "id": 550, "pid": 0, "tid": 7, "ts": 6079338246761.061,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 3416838, "tid": 3416838,
"ts": 6079338129478.736, "dur": 3.325,
"args": {
"External id": 92, "cbid": 307, "correlation": 550
}
},
{
"ph": "s", "id": 550, "pid": 3416838, "tid": 3416838, "ts": 6079338129478.736,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_3", "pid": 0, "tid": 7,
"ts": 6079338246810.245, "dur": 1.664,
"args": {
"External id": 113, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 557, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.007576, "grid": [1, 1, 1], "block": [32, 1, 1], "est. achieved occupancy %": 0
}
},
{
"ph": "f", "id": 557, "pid": 0, "tid": 7, "ts": 6079338246810.245,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 3416838, "tid": 3416838,
"ts": 6079338129537.174, "dur": 4.017,
"args": {
"External id": 113, "cbid": 307, "correlation": 557
}
},
{
"ph": "s", "id": 557, "pid": 3416838, "tid": 3416838, "ts": 6079338129537.174,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_4", "pid": 0, "tid": 7,
"ts": 6079338246813.989, "dur": 31.488,
"args": {
"External id": 114, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 564, "registers per thread": 16, "shared memory": 0, "blocks per SM": 285.553040, "warps per SM": 1142.212158, "grid": [37693, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100
}
},
{
"ph": "f", "id": 564, "pid": 0, "tid": 7, "ts": 6079338246813.989,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 3416838, "tid": 3416838,
"ts": 6079338129552.447, "dur": 3.025,
"args": {
"External id": 114, "cbid": 307, "correlation": 564
}
},
{
"ph": "s", "id": 564, "pid": 3416838, "tid": 3416838, "ts": 6079338129552.447,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_5", "pid": 0, "tid": 7,
"ts": 6079338246847.525, "dur": 1.632,
"args": {
"External id": 115, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 571, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.750000, "warps per SM": 3.000000, "grid": [99, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 5
}
},
{
"ph": "f", "id": 571, "pid": 0, "tid": 7, "ts": 6079338246847.525,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 3416838, "tid": 3416838,
"ts": 6079338129565.798, "dur": 2.864,
"args": {
"External id": 115, "cbid": 307, "correlation": 571
}
},
{
"ph": "s", "id": 571, "pid": 3416838, "tid": 3416838, "ts": 6079338129565.798,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 3416838, "tid": 3416838,
"ts": 6079338129604.085, "dur": 0.501,
"args": {
"External id": 116, "cbid": 200, "correlation": 586
}
},
{
"ph": "f", "id": 586, "pid": 3416838, "tid": 3416838, "ts": 6079338129604.085,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2<cutlass_75_tensorop_bf16_s1688gemm_bf16_128x128_tn_align1>(cutlass_75_tensorop_bf16_s1688gemm_bf16_128x128_tn_align1::Params)", "pid": 0, "tid": 7,
"ts": 6079338246850.021, "dur": 27093.853,
"args": {
"External id": 116, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 588, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 775.757568, "warps per SM": 3103.030273, "grid": [2048, 50, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13
}
},
{
"ph": "f", "id": 588, "pid": 0, "tid": 7, "ts": 6079338246850.021,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 3416838, "tid": 3416838,
"ts": 6079338129605.447, "dur": 3.776,
"args": {
"External id": 116, "cbid": 307, "correlation": 588
}
},
{
"ph": "s", "id": 588, "pid": 3416838, "tid": 3416838, "ts": 6079338129605.447,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_red_fused__log_softmax__log_softmax_backward_data_addmm_nll_loss_backward_nll_loss_forward_6", "pid": 0, "tid": 7,
"ts": 6079338273944.802, "dur": 5612.358,
"args": {
"External id": 117, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 609, "registers per thread": 48, "shared memory": 32, "blocks per SM": 248.242432, "warps per SM": 1985.939453, "grid": [32768, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 63
}
},
{
"ph": "f", "id": 609, "pid": 0, "tid": 7, "ts": 6079338273944.802,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 3416838, "tid": 3416838,
"ts": 6079338129631.827, "dur": 3.716,
"args": {
"External id": 117, "cbid": 307, "correlation": 609
}
},
{
"ph": "s", "id": 609, "pid": 3416838, "tid": 3416838, "ts": 6079338129631.827,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 3416838, "tid": 3416838,
"ts": 6079338129653.941, "dur": 0.240,
"args": {
"External id": 118, "cbid": 200, "correlation": 622
}
},
{
"ph": "f", "id": 622, "pid": 3416838, "tid": 3416838, "ts": 6079338129653.941,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2<cutlass_75_tensorop_bf16_s1688gemm_bf16_256x128_nn_align1>(cutlass_75_tensorop_bf16_s1688gemm_bf16_256x128_nn_align1::Params)", "pid": 0, "tid": 7,
"ts": 6079338279559.080, "dur": 19851.060,
"args": {
"External id": 118, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 625, "registers per thread": 229, "shared memory": 49152, "blocks per SM": 7.757576, "warps per SM": 62.060608, "grid": [1024, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13
}
},
{
"ph": "f", "id": 625, "pid": 0, "tid": 7, "ts": 6079338279559.080,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 3416838, "tid": 3416838,
"ts": 6079338129655.212, "dur": 3.045,
"args": {
"External id": 118, "cbid": 307, "correlation": 625
}
},
{
"ph": "s", "id": 625, "pid": 3416838, "tid": 3416838, "ts": 6079338129655.212,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_red_fused_nll_loss_forward_sum_7", "pid": 0, "tid": 7,
"ts": 6079338299412.060, "dur": 1656.675,
"args": {
"External id": 119, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 636, "registers per thread": 40, "shared memory": 4096, "blocks per SM": 5.954545, "warps per SM": 95.272728, "grid": [786, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 75
}
},
{
"ph": "f", "id": 636, "pid": 0, "tid": 7, "ts": 6079338299412.060,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 3416838, "tid": 3416838,
"ts": 6079338129674.652, "dur": 3.345,
"args": {
"External id": 119, "cbid": 307, "correlation": 636
}
},
{
"ph": "s", "id": 636, "pid": 3416838, "tid": 3416838, "ts": 6079338129674.652,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7,
"ts": 6079338301069.631, "dur": 75.552,
"args": {
"External id": 120, "device": 0, "context": 1, "stream": 7, "correlation": 643, "bytes": 77194752, "memory bandwidth (GB/s)": 1021.7433290978399
}
},
{
"ph": "f", "id": 643, "pid": 0, "tid": 7, "ts": 6079338301069.631,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 3416838, "tid": 3416838,
"ts": 6079338129691.798, "dur": 10.275,
"args": {
"External id": 120, "cbid": 41, "correlation": 643
}
},
{
"ph": "s", "id": 643, "pid": 3416838, "tid": 3416838, "ts": 6079338129691.798,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 3416838, "tid": 3416838,
"ts": 6079338129711.097, "dur": 0.290,
"args": {
"External id": 120, "cbid": 200, "correlation": 654
}
},
{
"ph": "f", "id": 654, "pid": 3416838, "tid": 3416838, "ts": 6079338129711.097,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2<cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_256x128_32x6_nt_align8>(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_256x128_32x6_nt_align8::Params)", "pid": 0, "tid": 7,
"ts": 6079338301147.231, "dur": 5839.910,
"args": {
"External id": 120, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 657, "registers per thread": 216, "shared memory": 147456, "blocks per SM": 11.909091, "warps per SM": 95.272728, "grid": [1572, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0
}
},
{
"ph": "f", "id": 657, "pid": 0, "tid": 7, "ts": 6079338301147.231,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 3416838, "tid": 3416838,
"ts": 6079338129712.339, "dur": 3.234,
"args": {
"External id": 120, "cbid": 307, "correlation": 657
}
},
{
"ph": "s", "id": 657, "pid": 3416838, "tid": 3416838, "ts": 6079338129712.339,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_red_fused_nll_loss_forward_8", "pid": 0, "tid": 7,
"ts": 6079338306989.189, "dur": 2.560,
"args": {
"External id": 121, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 669, "registers per thread": 26, "shared memory": 64, "blocks per SM": 0.030303, "warps per SM": 0.484848, "grid": [4, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 1
}
},
{
"ph": "f", "id": 669, "pid": 0, "tid": 7, "ts": 6079338306989.189,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 3416838, "tid": 3416838,
"ts": 6079338129740.922, "dur": 3.024,
"args": {
"External id": 121, "cbid": 307, "correlation": 669
}
},
{
"ph": "s", "id": 669, "pid": 3416838, "tid": 3416838, "ts": 6079338129740.922,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_per_fused_nll_loss_forward_9", "pid": 0, "tid": 7,
"ts": 6079338306992.613, "dur": 1.696,
"args": {
"External id": 122, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 674, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.015152, "grid": [1, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0
}
},
{
"ph": "f", "id": 674, "pid": 0, "tid": 7, "ts": 6079338306992.613,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 3416838, "tid": 3416838,
"ts": 6079338129755.754, "dur": 3.495,
"args": {
"External id": 122, "cbid": 307, "correlation": 674
}
},
{
"ph": "s", "id": 674, "pid": 3416838, "tid": 3416838, "ts": 6079338129755.754,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<8, at::native::FillFunctor<c10::BFloat16>, std::array<char*, 1ul> >(int, at::native::FillFunctor<c10::BFloat16>, std::array<char*, 1ul>)", "pid": 0, "tid": 7,
"ts": 6079338306995.173, "dur": 1.568,
"args": {
"External id": 126, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 685, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.030303, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0
}
},
{
"ph": "f", "id": 685, "pid": 0, "tid": 7, "ts": 6079338306995.173,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 3416838, "tid": 3416838,
"ts": 6079338129824.688, "dur": 6.139,
"args": {
"External id": 126, "cbid": 211, "correlation": 685
}
},
{
"ph": "s", "id": 685, "pid": 3416838, "tid": 3416838, "ts": 6079338129824.688,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 3416838, "tid": 3416838,
"ts": 6079338129847.973, "dur": 1.072,
"args": {
"External id": 87, "cbid": 135, "correlation": 693
}
},
{
"ph": "f", "id": 693, "pid": 3416838, "tid": 3416838, "ts": 6079338129847.973,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_0", "pid": 0, "tid": 7,
"ts": 6079338306997.669, "dur": 60.864,
"args": {
"External id": 549, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 702, "registers per thread": 16, "shared memory": 0, "blocks per SM": 285.553040, "warps per SM": 1142.212158, "grid": [37693, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100
}
},
{
"ph": "f", "id": 702, "pid": 0, "tid": 7, "ts": 6079338306997.669,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 3416838, "tid": 3420252,
"ts": 6079338129978.290, "dur": 9.654,
"args": {
"External id": 549, "cbid": 307, "correlation": 702
}
},
{
"ph": "s", "id": 702, "pid": 3416838, "tid": 3420252, "ts": 6079338129978.290,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_1", "pid": 0, "tid": 7,
"ts": 6079338307060.549, "dur": 2.304,
"args": {
"External id": 550, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 706, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.492424, "warps per SM": 5.969697, "grid": [197, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9
}
},
{
"ph": "f", "id": 706, "pid": 0, "tid": 7, "ts": 6079338307060.549,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 3416838, "tid": 3420252,
"ts": 6079338129999.742, "dur": 4.356,
"args": {
"External id": 550, "cbid": 307, "correlation": 706
}
},
{
"ph": "s", "id": 706, "pid": 3416838, "tid": 3420252, "ts": 6079338129999.742,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_mul_2", "pid": 0, "tid": 7,
"ts": 6079338307063.653, "dur": 47.424,
"args": {
"External id": 551, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 710, "registers per thread": 22, "shared memory": 0, "blocks per SM": 186.181824, "warps per SM": 744.727295, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100
}
},
{
"ph": "f", "id": 710, "pid": 0, "tid": 7, "ts": 6079338307063.653,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 3416838, "tid": 3420252,
"ts": 6079338130016.948, "dur": 2.864,
"args": {
"External id": 551, "cbid": 307, "correlation": 710
}
},
{
"ph": "s", "id": 710, "pid": 3416838, "tid": 3420252, "ts": 6079338130016.948,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 3416838, "tid": 3420252,
"ts": 6079338130039.582, "dur": 1.102,
"args": {
"External id": 547, "cbid": 135, "correlation": 715
}
},
{
"ph": "f", "id": 715, "pid": 3416838, "tid": 3420252, "ts": 6079338130039.582,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 3416838, "tid": 3420252,
"ts": 6079338130043.318, "dur": 0.380,
"args": {
"External id": 547, "cbid": 135, "correlation": 720
}
},
{
"ph": "f", "id": 720, "pid": 3416838, "tid": 3420252, "ts": 6079338130043.318,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 3416838, "tid": 3420252,
"ts": 6079338130046.232, "dur": 0.361,
"args": {
"External id": 547, "cbid": 135, "correlation": 725
}
},
{
"ph": "f", "id": 725, "pid": 3416838, "tid": 3420252, "ts": 6079338130046.232,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "kernel", "name": "triton_red_fused_nll_loss_backward_nll_loss_forward_0", "pid": 0, "tid": 7,
"ts": 6079338307111.973, "dur": 7.872,
"args": {
"External id": 132, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 786, "registers per thread": 32, "shared memory": 16384, "blocks per SM": 0.030303, "warps per SM": 0.484848, "grid": [4, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 1
}
},
{
"ph": "f", "id": 786, "pid": 0, "tid": 7, "ts": 6079338307111.973,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 3416838, "tid": 3416838,
"ts": 6079338130280.796, "dur": 11.567,
"args": {
"External id": 132, "cbid": 307, "correlation": 786
}
},
{
"ph": "s", "id": 786, "pid": 3416838, "tid": 3416838, "ts": 6079338130280.796,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_per_fused_nll_loss_forward_1", "pid": 0, "tid": 7,
"ts": 6079338307121.925, "dur": 1.664,
"args": {
"External id": 133, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 794, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.015152, "grid": [1, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0
}
},
{
"ph": "f", "id": 794, "pid": 0, "tid": 7, "ts": 6079338307121.925,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 3416838, "tid": 3416838,
"ts": 6079338130316.189, "dur": 4.206,
"args": {
"External id": 133, "cbid": 307, "correlation": 794
}
},
{
"ph": "s", "id": 794, "pid": 3416838, "tid": 3416838, "ts": 6079338130316.189,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_mul_2", "pid": 0, "tid": 7,
"ts": 6079338307124.549, "dur": 48.960,
"args": {
"External id": 134, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 802, "registers per thread": 16, "shared memory": 0, "blocks per SM": 186.181824, "warps per SM": 744.727295, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100
}
},
{
"ph": "f", "id": 802, "pid": 0, "tid": 7, "ts": 6079338307124.549,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 3416838, "tid": 3416838,
"ts": 6079338130333.445, "dur": 3.015,
"args": {
"External id": 134, "cbid": 307, "correlation": 802
}
},
{
"ph": "s", "id": 802, "pid": 3416838, "tid": 3416838, "ts": 6079338130333.445,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_3", "pid": 0, "tid": 7,
"ts": 6079338307174.373, "dur": 1.504,
"args": {
"External id": 155, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 809, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.007576, "grid": [1, 1, 1], "block": [32, 1, 1], "est. achieved occupancy %": 0
}
},
{
"ph": "f", "id": 809, "pid": 0, "tid": 7, "ts": 6079338307174.373,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 3416838, "tid": 3416838,
"ts": 6079338130390.661, "dur": 4.287,
"args": {
"External id": 155, "cbid": 307, "correlation": 809
}
},
{
"ph": "s", "id": 809, "pid": 3416838, "tid": 3416838, "ts": 6079338130390.661,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_4", "pid": 0, "tid": 7,
"ts": 6079338307176.677, "dur": 31.328,
"args": {
"External id": 156, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 816, "registers per thread": 16, "shared memory": 0, "blocks per SM": 285.553040, "warps per SM": 1142.212158, "grid": [37693, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100
}
},
{
"ph": "f", "id": 816, "pid": 0, "tid": 7, "ts": 6079338307176.677,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 3416838, "tid": 3416838,
"ts": 6079338130405.714, "dur": 2.874,
"args": {
"External id": 156, "cbid": 307, "correlation": 816
}
},
{
"ph": "s", "id": 816, "pid": 3416838, "tid": 3416838, "ts": 6079338130405.714,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_5", "pid": 0, "tid": 7,
"ts": 6079338307209.925, "dur": 1.536,
"args": {
"External id": 157, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 823, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.750000, "warps per SM": 3.000000, "grid": [99, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 5
}
},
{
"ph": "f", "id": 823, "pid": 0, "tid": 7, "ts": 6079338307209.925,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 3416838, "tid": 3416838,
"ts": 6079338130418.784, "dur": 2.383,
"args": {
"External id": 157, "cbid": 307, "correlation": 823
}
},
{
"ph": "s", "id": 823, "pid": 3416838, "tid": 3416838, "ts": 6079338130418.784,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 3416838, "tid": 3416838,
"ts": 6079338130457.422, "dur": 0.481,
"args": {
"External id": 158, "cbid": 200, "correlation": 838
}
},
{
"ph": "f", "id": 838, "pid": 3416838, "tid": 3416838, "ts": 6079338130457.422,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2<cutlass_75_tensorop_bf16_s1688gemm_bf16_128x128_tn_align1>(cutlass_75_tensorop_bf16_s1688gemm_bf16_128x128_tn_align1::Params)", "pid": 0, "tid": 7,
"ts": 6079338307212.389, "dur": 27076.957,
"args": {
"External id": 158, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 840, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 775.757568, "warps per SM": 3103.030273, "grid": [2048, 50, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13
}
},
{
"ph": "f", "id": 840, "pid": 0, "tid": 7, "ts": 6079338307212.389,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 3416838, "tid": 3416838,
"ts": 6079338130458.784, "dur": 3.846,
"args": {
"External id": 158, "cbid": 307, "correlation": 840
}
},
{
"ph": "s", "id": 840, "pid": 3416838, "tid": 3416838, "ts": 6079338130458.784,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_red_fused__log_softmax__log_softmax_backward_data_addmm_nll_loss_backward_nll_loss_forward_6", "pid": 0, "tid": 7,
"ts": 6079338334290.242, "dur": 5609.222,
"args": {
"External id": 159, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 861, "registers per thread": 48, "shared memory": 32, "blocks per SM": 248.242432, "warps per SM": 1985.939453, "grid": [32768, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 63
}
},
{
"ph": "f", "id": 861, "pid": 0, "tid": 7, "ts": 6079338334290.242,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 3416838, "tid": 3416838,
"ts": 6079338130488.459, "dur": 3.285,
"args": {
"External id": 159, "cbid": 307, "correlation": 861
}
},
{
"ph": "s", "id": 861, "pid": 3416838, "tid": 3416838, "ts": 6079338130488.459,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 3416838, "tid": 3416838,
"ts": 6079338130508.229, "dur": 0.230,
"args": {
"External id": 160, "cbid": 200, "correlation": 874
}
},
{
"ph": "f", "id": 874, "pid": 3416838, "tid": 3416838, "ts": 6079338130508.229,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2<cutlass_75_tensorop_bf16_s1688gemm_bf16_256x128_nn_align1>(cutlass_75_tensorop_bf16_s1688gemm_bf16_256x128_nn_align1::Params)", "pid": 0, "tid": 7,
"ts": 6079338339900.328, "dur": 19794.837,
"args": {
"External id": 160, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 877, "registers per thread": 229, "shared memory": 49152, "blocks per SM": 7.757576, "warps per SM": 62.060608, "grid": [1024, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13
}
},
{
"ph": "f", "id": 877, "pid": 0, "tid": 7, "ts": 6079338339900.328,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 3416838, "tid": 3416838,
"ts": 6079338130509.390, "dur": 3.335,
"args": {
"External id": 160, "cbid": 307, "correlation": 877
}
},
{
"ph": "s", "id": 877, "pid": 3416838, "tid": 3416838, "ts": 6079338130509.390,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_red_fused_nll_loss_forward_sum_7", "pid": 0, "tid": 7,
"ts": 6079338359697.245, "dur": 1655.874,
"args": {
"External id": 161, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 888, "registers per thread": 40, "shared memory": 4096, "blocks per SM": 5.954545, "warps per SM": 95.272728, "grid": [786, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 75
}
},
{
"ph": "f", "id": 888, "pid": 0, "tid": 7, "ts": 6079338359697.245,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 3416838, "tid": 3416838,
"ts": 6079338130532.485, "dur": 2.744,
"args": {
"External id": 161, "cbid": 307, "correlation": 888
}
},
{
"ph": "s", "id": 888, "pid": 3416838, "tid": 3416838, "ts": 6079338130532.485,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7,
"ts": 6079338361355.167, "dur": 75.328,
"args": {
"External id": 162, "device": 0, "context": 1, "stream": 7, "correlation": 895, "bytes": 77194752, "memory bandwidth (GB/s)": 1024.7816482582837
}
},
{
"ph": "f", "id": 895, "pid": 0, "tid": 7, "ts": 6079338361355.167,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 3416838, "tid": 3416838,
"ts": 6079338130548.529, "dur": 12.059,
"args": {
"External id": 162, "cbid": 41, "correlation": 895
}
},
{
"ph": "s", "id": 895, "pid": 3416838, "tid": 3416838, "ts": 6079338130548.529,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 3416838, "tid": 3416838,
"ts": 6079338130570.192, "dur": 0.280,
"args": {
"External id": 162, "cbid": 200, "correlation": 906
}
},
{
"ph": "f", "id": 906, "pid": 3416838, "tid": 3416838, "ts": 6079338130570.192,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2<cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_256x128_32x6_nt_align8>(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_256x128_32x6_nt_align8::Params)", "pid": 0, "tid": 7,
"ts": 6079338361431.295, "dur": 5837.478,
"args": {
"External id": 162, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 909, "registers per thread": 216, "shared memory": 147456, "blocks per SM": 11.909091, "warps per SM": 95.272728, "grid": [1572, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0
}
},
{
"ph": "f", "id": 909, "pid": 0, "tid": 7, "ts": 6079338361431.295,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 3416838, "tid": 3416838,
"ts": 6079338130571.364, "dur": 2.884,
"args": {
"External id": 162, "cbid": 307, "correlation": 909
}
},
{
"ph": "s", "id": 909, "pid": 3416838, "tid": 3416838, "ts": 6079338130571.364,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_red_fused_nll_loss_forward_8", "pid": 0, "tid": 7,
"ts": 6079338367270.821, "dur": 2.656,
"args": {
"External id": 163, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 921, "registers per thread": 26, "shared memory": 64, "blocks per SM": 0.030303, "warps per SM": 0.484848, "grid": [4, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 1
}
},
{
"ph": "f", "id": 921, "pid": 0, "tid": 7, "ts": 6079338367270.821,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 3416838, "tid": 3416838,
"ts": 6079338130599.386, "dur": 3.145,
"args": {
"External id": 163, "cbid": 307, "correlation": 921
}
},
{
"ph": "s", "id": 921, "pid": 3416838, "tid": 3416838, "ts": 6079338130599.386,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_per_fused_nll_loss_forward_9", "pid": 0, "tid": 7,
"ts": 6079338367274.373, "dur": 1.728,
"args": {
"External id": 164, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 926, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.015152, "grid": [1, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0
}
},
{
"ph": "f", "id": 926, "pid": 0, "tid": 7, "ts": 6079338367274.373,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 3416838, "tid": 3416838,
"ts": 6079338130616.251, "dur": 2.685,
"args": {
"External id": 164, "cbid": 307, "correlation": 926
}
},
{
"ph": "s", "id": 926, "pid": 3416838, "tid": 3416838, "ts": 6079338130616.251,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<8, at::native::FillFunctor<c10::BFloat16>, std::array<char*, 1ul> >(int, at::native::FillFunctor<c10::BFloat16>, std::array<char*, 1ul>)", "pid": 0, "tid": 7,
"ts": 6079338367276.965, "dur": 1.376,
"args": {
"External id": 168, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 937, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.030303, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0
}
},
{
"ph": "f", "id": 937, "pid": 0, "tid": 7, "ts": 6079338367276.965,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 3416838, "tid": 3416838,
"ts": 6079338130687.299, "dur": 5.989,
"args": {
"External id": 168, "cbid": 211, "correlation": 937
}
},
{
"ph": "s", "id": 937, "pid": 3416838, "tid": 3416838, "ts": 6079338130687.299,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 3416838, "tid": 3416838,
"ts": 6079338130710.253, "dur": 1.112,
"args": {
"External id": 129, "cbid": 135, "correlation": 945
}
},
{
"ph": "f", "id": 945, "pid": 3416838, "tid": 3416838, "ts": 6079338130710.253,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_0", "pid": 0, "tid": 7,
"ts": 6079338367279.205, "dur": 60.832,
"args": {
"External id": 566, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 954, "registers per thread": 16, "shared memory": 0, "blocks per SM": 285.553040, "warps per SM": 1142.212158, "grid": [37693, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100
}
},
{
"ph": "f", "id": 954, "pid": 0, "tid": 7, "ts": 6079338367279.205,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 3416838, "tid": 3420252,
"ts": 6079338130837.906, "dur": 9.244,
"args": {
"External id": 566, "cbid": 307, "correlation": 954
}
},
{
"ph": "s", "id": 954, "pid": 3416838, "tid": 3420252, "ts": 6079338130837.906,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_1", "pid": 0, "tid": 7,
"ts": 6079338367342.085, "dur": 2.336,
"args": {
"External id": 567, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 958, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.492424, "warps per SM": 5.969697, "grid": [197, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9
}
},
{
"ph": "f", "id": 958, "pid": 0, "tid": 7, "ts": 6079338367342.085,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 3416838, "tid": 3420252,
"ts": 6079338130858.316, "dur": 4.127,
"args": {
"External id": 567, "cbid": 307, "correlation": 958
}
},
{
"ph": "s", "id": 958, "pid": 3416838, "tid": 3420252, "ts": 6079338130858.316,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_mul_2", "pid": 0, "tid": 7,
"ts": 6079338367345.349, "dur": 47.872,
"args": {
"External id": 568, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 962, "registers per thread": 22, "shared memory": 0, "blocks per SM": 186.181824, "warps per SM": 744.727295, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100
}
},
{
"ph": "f", "id": 962, "pid": 0, "tid": 7, "ts": 6079338367345.349,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 3416838, "tid": 3420252,
"ts": 6079338130875.082, "dur": 3.645,
"args": {
"External id": 568, "cbid": 307, "correlation": 962
}
},
{
"ph": "s", "id": 962, "pid": 3416838, "tid": 3420252, "ts": 6079338130875.082,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 3416838, "tid": 3420252,
"ts": 6079338130897.906, "dur": 0.841,
"args": {
"External id": 564, "cbid": 135, "correlation": 967
}
},
{
"ph": "f", "id": 967, "pid": 3416838, "tid": 3420252, "ts": 6079338130897.906,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 3416838, "tid": 3420252,
"ts": 6079338130901.482, "dur": 0.560,
"args": {
"External id": 564, "cbid": 135, "correlation": 972
}
},
{
"ph": "f", "id": 972, "pid": 3416838, "tid": 3420252, "ts": 6079338130901.482,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 3416838, "tid": 3420252,
"ts": 6079338130904.466, "dur": 0.371,
"args": {
"External id": 564, "cbid": 135, "correlation": 977
}
},
{
"ph": "f", "id": 977, "pid": 3416838, "tid": 3420252, "ts": 6079338130904.466,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "kernel", "name": "triton_red_fused_nll_loss_backward_nll_loss_forward_0", "pid": 0, "tid": 7,
"ts": 6079338367394.085, "dur": 7.616,
"args": {
"External id": 174, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1038, "registers per thread": 32, "shared memory": 16384, "blocks per SM": 0.030303, "warps per SM": 0.484848, "grid": [4, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 1
}
},
{
"ph": "f", "id": 1038, "pid": 0, "tid": 7, "ts": 6079338367394.085,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 3416838, "tid": 3416838,
"ts": 6079338131141.523, "dur": 10.987,
"args": {
"External id": 174, "cbid": 307, "correlation": 1038
}
},
{
"ph": "s", "id": 1038, "pid": 3416838, "tid": 3416838, "ts": 6079338131141.523,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_per_fused_nll_loss_forward_1", "pid": 0, "tid": 7,
"ts": 6079338367402.661, "dur": 1.600,
"args": {
"External id": 175, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1046, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.015152, "grid": [1, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0
}
},
{
"ph": "f", "id": 1046, "pid": 0, "tid": 7, "ts": 6079338367402.661,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 3416838, "tid": 3416838,
"ts": 6079338131166.461, "dur": 3.115,
"args": {
"External id": 175, "cbid": 307, "correlation": 1046
}
},
{
"ph": "s", "id": 1046, "pid": 3416838, "tid": 3416838, "ts": 6079338131166.461,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_mul_2", "pid": 0, "tid": 7,
"ts": 6079338367406.245, "dur": 48.096,
"args": {
"External id": 176, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1054, "registers per thread": 16, "shared memory": 0, "blocks per SM": 186.181824, "warps per SM": 744.727295, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100
}
},
{
"ph": "f", "id": 1054, "pid": 0, "tid": 7, "ts": 6079338367406.245,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 3416838, "tid": 3416838,
"ts": 6079338131182.445, "dur": 3.676,
"args": {
"External id": 176, "cbid": 307, "correlation": 1054
}
},
{
"ph": "s", "id": 1054, "pid": 3416838, "tid": 3416838, "ts": 6079338131182.445,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_3", "pid": 0, "tid": 7,
"ts": 6079338367456.357, "dur": 1.504,
"args": {
"External id": 197, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1061, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.007576, "grid": [1, 1, 1], "block": [32, 1, 1], "est. achieved occupancy %": 0
}
},
{
"ph": "f", "id": 1061, "pid": 0, "tid": 7, "ts": 6079338367456.357,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 3416838, "tid": 3416838,
"ts": 6079338131243.277, "dur": 4.507,
"args": {
"External id": 197, "cbid": 307, "correlation": 1061
}
},
{
"ph": "s", "id": 1061, "pid": 3416838, "tid": 3416838, "ts": 6079338131243.277,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_4", "pid": 0, "tid": 7,
"ts": 6079338367458.661, "dur": 32.064,
"args": {
"External id": 198, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1068, "registers per thread": 16, "shared memory": 0, "blocks per SM": 285.553040, "warps per SM": 1142.212158, "grid": [37693, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100
}
},
{
"ph": "f", "id": 1068, "pid": 0, "tid": 7, "ts": 6079338367458.661,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 3416838, "tid": 3416838,
"ts": 6079338131258.670, "dur": 3.155,
"args": {
"External id": 198, "cbid": 307, "correlation": 1068
}
},
{
"ph": "s", "id": 1068, "pid": 3416838, "tid": 3416838, "ts": 6079338131258.670,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_5", "pid": 0, "tid": 7,
"ts": 6079338367491.589, "dur": 1.536,
"args": {
"External id": 199, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1075, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.750000, "warps per SM": 3.000000, "grid": [99, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 5
}
},
{
"ph": "f", "id": 1075, "pid": 0, "tid": 7, "ts": 6079338367491.589,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 3416838, "tid": 3416838,
"ts": 6079338131271.369, "dur": 2.344,
"args": {
"External id": 199, "cbid": 307, "correlation": 1075
}
},
{
"ph": "s", "id": 1075, "pid": 3416838, "tid": 3416838, "ts": 6079338131271.369,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 3416838, "tid": 3416838,
"ts": 6079338131320.674, "dur": 0.480,
"args": {
"External id": 200, "cbid": 200, "correlation": 1090
}
},
{
"ph": "f", "id": 1090, "pid": 3416838, "tid": 3416838, "ts": 6079338131320.674,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2<cutlass_75_tensorop_bf16_s1688gemm_bf16_128x128_tn_align1>(cutlass_75_tensorop_bf16_s1688gemm_bf16_128x128_tn_align1::Params)", "pid": 0, "tid": 7,
"ts": 6079338367495.141, "dur": 27041.597,
"args": {
"External id": 200, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1092, "registers per thread": 254, "shared memory": 32768, "blocks per SM": 775.757568, "warps per SM": 3103.030273, "grid": [2048, 50, 1], "block": [128, 1, 1], "est. achieved occupancy %": 13
}
},
{
"ph": "f", "id": 1092, "pid": 0, "tid": 7, "ts": 6079338367495.141,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 3416838, "tid": 3416838,
"ts": 6079338131322.206, "dur": 4.136,
"args": {
"External id": 200, "cbid": 307, "correlation": 1092
}
},
{
"ph": "s", "id": 1092, "pid": 3416838, "tid": 3416838, "ts": 6079338131322.206,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_red_fused__log_softmax__log_softmax_backward_data_addmm_nll_loss_backward_nll_loss_forward_6", "pid": 0, "tid": 7,
"ts": 6079338394538.818, "dur": 5610.918,
"args": {
"External id": 201, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1113, "registers per thread": 48, "shared memory": 32, "blocks per SM": 248.242432, "warps per SM": 1985.939453, "grid": [32768, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 63
}
},
{
"ph": "f", "id": 1113, "pid": 0, "tid": 7, "ts": 6079338394538.818,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 3416838, "tid": 3416838,
"ts": 6079338131351.310, "dur": 3.395,
"args": {
"External id": 201, "cbid": 307, "correlation": 1113
}
},
{
"ph": "s", "id": 1113, "pid": 3416838, "tid": 3416838, "ts": 6079338131351.310,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 3416838, "tid": 3416838,
"ts": 6079338131370.799, "dur": 0.210,
"args": {
"External id": 202, "cbid": 200, "correlation": 1126
}
},
{
"ph": "f", "id": 1126, "pid": 3416838, "tid": 3416838, "ts": 6079338131370.799,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2<cutlass_75_tensorop_bf16_s1688gemm_bf16_256x128_nn_align1>(cutlass_75_tensorop_bf16_s1688gemm_bf16_256x128_nn_align1::Params)", "pid": 0, "tid": 7,
"ts": 6079338400150.600, "dur": 19836.597,
"args": {
"External id": 202, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1129, "registers per thread": 229, "shared memory": 49152, "blocks per SM": 7.757576, "warps per SM": 62.060608, "grid": [1024, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 13
}
},
{
"ph": "f", "id": 1129, "pid": 0, "tid": 7, "ts": 6079338400150.600,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 3416838, "tid": 3416838,
"ts": 6079338131371.971, "dur": 2.874,
"args": {
"External id": 202, "cbid": 307, "correlation": 1129
}
},
{
"ph": "s", "id": 1129, "pid": 3416838, "tid": 3416838, "ts": 6079338131371.971,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_red_fused_nll_loss_forward_sum_7", "pid": 0, "tid": 7,
"ts": 6079338419988.029, "dur": 1657.346,
"args": {
"External id": 203, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1140, "registers per thread": 40, "shared memory": 4096, "blocks per SM": 5.954545, "warps per SM": 95.272728, "grid": [786, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 75
}
},
{
"ph": "f", "id": 1140, "pid": 0, "tid": 7, "ts": 6079338419988.029,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 3416838, "tid": 3416838,
"ts": 6079338131391.310, "dur": 2.965,
"args": {
"External id": 203, "cbid": 307, "correlation": 1140
}
},
{
"ph": "s", "id": 1140, "pid": 3416838, "tid": 3416838, "ts": 6079338131391.310,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7,
"ts": 6079338421647.391, "dur": 76.704,
"args": {
"External id": 204, "device": 0, "context": 1, "stream": 7, "correlation": 1147, "bytes": 77194752, "memory bandwidth (GB/s)": 1006.3979974968711
}
},
{
"ph": "f", "id": 1147, "pid": 0, "tid": 7, "ts": 6079338421647.391,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 3416838, "tid": 3416838,
"ts": 6079338131408.266, "dur": 10.495,
"args": {
"External id": 204, "cbid": 41, "correlation": 1147
}
},
{
"ph": "s", "id": 1147, "pid": 3416838, "tid": 3416838, "ts": 6079338131408.266,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 3416838, "tid": 3416838,
"ts": 6079338131427.965, "dur": 0.241,
"args": {
"External id": 204, "cbid": 200, "correlation": 1158
}
},
{
"ph": "f", "id": 1158, "pid": 3416838, "tid": 3416838, "ts": 6079338131427.965,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2<cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_256x128_32x6_nt_align8>(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_256x128_32x6_nt_align8::Params)", "pid": 0, "tid": 7,
"ts": 6079338421724.895, "dur": 5828.870,
"args": {
"External id": 204, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1161, "registers per thread": 216, "shared memory": 147456, "blocks per SM": 11.909091, "warps per SM": 95.272728, "grid": [1572, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 0
}
},
{
"ph": "f", "id": 1161, "pid": 0, "tid": 7, "ts": 6079338421724.895,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 3416838, "tid": 3416838,
"ts": 6079338131428.967, "dur": 2.984,
"args": {
"External id": 204, "cbid": 307, "correlation": 1161
}
},
{
"ph": "s", "id": 1161, "pid": 3416838, "tid": 3416838, "ts": 6079338131428.967,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_red_fused_nll_loss_forward_8", "pid": 0, "tid": 7,
"ts": 6079338427554.757, "dur": 2.688,
"args": {
"External id": 205, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1173, "registers per thread": 26, "shared memory": 64, "blocks per SM": 0.030303, "warps per SM": 0.484848, "grid": [4, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 1
}
},
{
"ph": "f", "id": 1173, "pid": 0, "tid": 7, "ts": 6079338427554.757,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 3416838, "tid": 3416838,
"ts": 6079338131456.108, "dur": 3.345,
"args": {
"External id": 205, "cbid": 307, "correlation": 1173
}
},
{
"ph": "s", "id": 1173, "pid": 3416838, "tid": 3416838, "ts": 6079338131456.108,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_per_fused_nll_loss_forward_9", "pid": 0, "tid": 7,
"ts": 6079338427559.493, "dur": 1.984,
"args": {
"External id": 206, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1178, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.015152, "grid": [1, 1, 1], "block": [64, 1, 1], "est. achieved occupancy %": 0
}
},
{
"ph": "f", "id": 1178, "pid": 0, "tid": 7, "ts": 6079338427559.493,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 3416838, "tid": 3416838,
"ts": 6079338131471.801, "dur": 2.965,
"args": {
"External id": 206, "cbid": 307, "correlation": 1178
}
},
{
"ph": "s", "id": 1178, "pid": 3416838, "tid": 3416838, "ts": 6079338131471.801,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<8, at::native::FillFunctor<c10::BFloat16>, std::array<char*, 1ul> >(int, at::native::FillFunctor<c10::BFloat16>, std::array<char*, 1ul>)", "pid": 0, "tid": 7,
"ts": 6079338427562.437, "dur": 1.376,
"args": {
"External id": 210, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1189, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007576, "warps per SM": 0.030303, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0
}
},
{
"ph": "f", "id": 1189, "pid": 0, "tid": 7, "ts": 6079338427562.437,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 3416838, "tid": 3416838,
"ts": 6079338131540.946, "dur": 5.147,
"args": {
"External id": 210, "cbid": 211, "correlation": 1189
}
},
{
"ph": "s", "id": 1189, "pid": 3416838, "tid": 3416838, "ts": 6079338131540.946,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 3416838, "tid": 3416838,
"ts": 6079338131562.879, "dur": 1.061,
"args": {
"External id": 171, "cbid": 135, "correlation": 1197
}
},
{
"ph": "f", "id": 1197, "pid": 3416838, "tid": 3416838, "ts": 6079338131562.879,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_0", "pid": 0, "tid": 7,
"ts": 6079338427564.741, "dur": 60.960,
"args": {
"External id": 583, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1206, "registers per thread": 16, "shared memory": 0, "blocks per SM": 285.553040, "warps per SM": 1142.212158, "grid": [37693, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100
}
},
{
"ph": "f", "id": 1206, "pid": 0, "tid": 7, "ts": 6079338427564.741,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 3416838, "tid": 3420252,
"ts": 6079338131687.687, "dur": 9.735,
"args": {
"External id": 583, "cbid": 307, "correlation": 1206
}
},
{
"ph": "s", "id": 1206, "pid": 3416838, "tid": 3420252, "ts": 6079338131687.687,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_1", "pid": 0, "tid": 7,
"ts": 6079338427626.533, "dur": 2.336,
"args": {
"External id": 584, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1210, "registers per thread": 16, "shared memory": 0, "blocks per SM": 1.492424, "warps per SM": 5.969697, "grid": [197, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 9
}
},
{
"ph": "f", "id": 1210, "pid": 0, "tid": 7, "ts": 6079338427626.533,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 3416838, "tid": 3420252,
"ts": 6079338131709.330, "dur": 3.475,
"args": {
"External id": 584, "cbid": 307, "correlation": 1210
}
},
{
"ph": "s", "id": 1210, "pid": 3416838, "tid": 3420252, "ts": 6079338131709.330,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "kernel", "name": "triton_poi_fused_mul_2", "pid": 0, "tid": 7,
"ts": 6079338427629.669, "dur": 47.424,
"args": {
"External id": 585, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 1214, "registers per thread": 22, "shared memory": 0, "blocks per SM": 186.181824, "warps per SM": 744.727295, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100
}
},
{
"ph": "f", "id": 1214, "pid": 0, "tid": 7, "ts": 6079338427629.669,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 3416838, "tid": 3420252,
"ts": 6079338131725.344, "dur": 3.575,
"args": {
"External id": 585, "cbid": 307, "correlation": 1214
}
},
{
"ph": "s", "id": 1214, "pid": 3416838, "tid": 3420252, "ts": 6079338131725.344,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 3416838, "tid": 3420252,
"ts": 6079338131747.637, "dur": 0.812,
"args": {
"External id": 581, "cbid": 135, "correlation": 1219
}
},
{
"ph": "f", "id": 1219, "pid": 3416838, "tid": 3420252, "ts": 6079338131747.637,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 3416838, "tid": 3420252,
"ts": 6079338131750.862, "dur": 0.391,
"args": {
"External id": 581, "cbid": 135, "correlation": 1224
}
},
{
"ph": "f", "id": 1224, "pid": 3416838, "tid": 3420252, "ts": 6079338131750.862,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 3416838, "tid": 3420252,
"ts": 6079338131753.316, "dur": 0.340,
"args": {
"External id": 581, "cbid": 135, "correlation": 1229
}
},
{
"ph": "f", "id": 1229, "pid": 3416838, "tid": 3420252, "ts": 6079338131753.316,
"cat": "ac2g", "name": "ac2g", "bp": "e"
},
{
"ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceSynchronize", "pid": 3416838, "tid": 3416838,
"ts": 6079338131898.395, "dur": 295803.100,
"args": {
"cbid": 165, "correlation": 1263
}
},
{
"ph": "s", "id": 1263, "pid": 3416838, "tid": 3416838, "ts": 6079338131898.395,
"cat": "ac2g", "name": "ac2g"
},
{
"ph": "X", "cat": "gpu_user_annotation", "name": "Step 4", "pid": 0, "tid": 7,
"ts": 6079338367394.084, "dur": 60169.730,
"args": {
"External id": 169
}
},
{
"ph": "X", "cat": "gpu_user_annotation", "name": "Step 3", "pid": 0, "tid": 7,
"ts": 6079338307111.972, "dur": 60166.370,
"args": {
"External id": 127
}
},
{
"ph": "X", "cat": "gpu_user_annotation", "name": "Step 2", "pid": 0, "tid": 7,
"ts": 6079338246750.148, "dur": 60246.594,
"args": {
"External id": 85
}
},
{
"ph": "X", "cat": "gpu_user_annotation", "name": "Step 1", "pid": 0, "tid": 7,
"ts": 6079338186608.484, "dur": 60025.762,
"args": {
"External id": 43
}
},
{
"ph": "X", "cat": "gpu_user_annotation", "name": "Step 0", "pid": 0, "tid": 7,
"ts": 6079338126554.436, "dur": 59937.666,
"args": {
"External id": 1
}
},
{
"name": "process_name", "ph": "M", "ts": 6079338125113.712, "pid": 3416838, "tid": 0,
"args": {
"name": "python"
}
},
{
"name": "process_labels", "ph": "M", "ts": 6079338125113.712, "pid": 3416838, "tid": 0,
"args": {
"labels": "CPU"
}
},
{
"name": "process_sort_index", "ph": "M", "ts": 6079338125113.712, "pid": 3416838, "tid": 0,
"args": {
"sort_index": 3416838
}
},
{
"name": "process_name", "ph": "M", "ts": 6079338125113.712, "pid": 0, "tid": 0,
"args": {
"name": "python"
}
},
{
"name": "process_labels", "ph": "M", "ts": 6079338125113.712, "pid": 0, "tid": 0,
"args": {
"labels": "GPU 0"
}
},
{
"name": "process_sort_index", "ph": "M", "ts": 6079338125113.712, "pid": 0, "tid": 0,
"args": {
"sort_index": 5000000
}
},
{
"name": "process_name", "ph": "M", "ts": 6079338125113.712, "pid": 1, "tid": 0,
"args": {
"name": "python"
}
},
{
"name": "process_labels", "ph": "M", "ts": 6079338125113.712, "pid": 1, "tid": 0,
"args": {
"labels": "GPU 1"
}
},
{
"name": "process_sort_index", "ph": "M", "ts": 6079338125113.712, "pid": 1, "tid": 0,
"args": {
"sort_index": 5000001
}
},
{
"name": "process_name", "ph": "M", "ts": 6079338125113.712, "pid": 2, "tid": 0,
"args": {
"name": "python"
}
},
{
"name": "process_labels", "ph": "M", "ts": 6079338125113.712, "pid": 2, "tid": 0,
"args": {
"labels": "GPU 2"
}
},
{
"name": "process_sort_index", "ph": "M", "ts": 6079338125113.712, "pid": 2, "tid": 0,
"args": {
"sort_index": 5000002
}
},
{
"name": "process_name", "ph": "M", "ts": 6079338125113.712, "pid": 3, "tid": 0,
"args": {
"name": "python"
}
},
{
"name": "process_labels", "ph": "M", "ts": 6079338125113.712, "pid": 3, "tid": 0,
"args": {
"labels": "GPU 3"
}
},
{
"name": "process_sort_index", "ph": "M", "ts": 6079338125113.712, "pid": 3, "tid": 0,
"args": {
"sort_index": 5000003
}
},
{
"name": "process_name", "ph": "M", "ts": 6079338125113.712, "pid": 4, "tid": 0,
"args": {
"name": "python"
}
},
{
"name": "process_labels", "ph": "M", "ts": 6079338125113.712, "pid": 4, "tid": 0,
"args": {
"labels": "GPU 4"
}
},
{
"name": "process_sort_index", "ph": "M", "ts": 6079338125113.712, "pid": 4, "tid": 0,
"args": {
"sort_index": 5000004
}
},
{
"name": "process_name", "ph": "M", "ts": 6079338125113.712, "pid": 5, "tid": 0,
"args": {
"name": "python"
}
},
{
"name": "process_labels", "ph": "M", "ts": 6079338125113.712, "pid": 5, "tid": 0,
"args": {
"labels": "GPU 5"
}
},
{
"name": "process_sort_index", "ph": "M", "ts": 6079338125113.712, "pid": 5, "tid": 0,
"args": {
"sort_index": 5000005
}
},
{
"name": "process_name", "ph": "M", "ts": 6079338125113.712, "pid": 6, "tid": 0,
"args": {
"name": "python"
}
},
{
"name": "process_labels", "ph": "M", "ts": 6079338125113.712, "pid": 6, "tid": 0,
"args": {
"labels": "GPU 6"
}
},
{
"name": "process_sort_index", "ph": "M", "ts": 6079338125113.712, "pid": 6, "tid": 0,
"args": {
"sort_index": 5000006
}
},
{
"name": "process_name", "ph": "M", "ts": 6079338125113.712, "pid": 7, "tid": 0,
"args": {
"name": "python"
}
},
{
"name": "process_labels", "ph": "M", "ts": 6079338125113.712, "pid": 7, "tid": 0,
"args": {
"labels": "GPU 7"
}
},
{
"name": "process_sort_index", "ph": "M", "ts": 6079338125113.712, "pid": 7, "tid": 0,
"args": {
"sort_index": 5000007
}
},
{
"name": "process_name", "ph": "M", "ts": 6079338125113.712, "pid": 8, "tid": 0,
"args": {
"name": "python"
}
},
{
"name": "process_labels", "ph": "M", "ts": 6079338125113.712, "pid": 8, "tid": 0,
"args": {
"labels": "GPU 8"
}
},
{
"name": "process_sort_index", "ph": "M", "ts": 6079338125113.712, "pid": 8, "tid": 0,
"args": {
"sort_index": 5000008
}
},
{
"name": "process_name", "ph": "M", "ts": 6079338125113.712, "pid": 9, "tid": 0,
"args": {
"name": "python"
}
},
{
"name": "process_labels", "ph": "M", "ts": 6079338125113.712, "pid": 9, "tid": 0,
"args": {
"labels": "GPU 9"
}
},
{
"name": "process_sort_index", "ph": "M", "ts": 6079338125113.712, "pid": 9, "tid": 0,
"args": {
"sort_index": 5000009
}
},
{
"name": "process_name", "ph": "M", "ts": 6079338125113.712, "pid": 10, "tid": 0,
"args": {
"name": "python"
}
},
{
"name": "process_labels", "ph": "M", "ts": 6079338125113.712, "pid": 10, "tid": 0,
"args": {
"labels": "GPU 10"
}
},
{
"name": "process_sort_index", "ph": "M", "ts": 6079338125113.712, "pid": 10, "tid": 0,
"args": {
"sort_index": 5000010
}
},
{
"name": "process_name", "ph": "M", "ts": 6079338125113.712, "pid": 11, "tid": 0,
"args": {
"name": "python"
}
},
{
"name": "process_labels", "ph": "M", "ts": 6079338125113.712, "pid": 11, "tid": 0,
"args": {
"labels": "GPU 11"
}
},
{
"name": "process_sort_index", "ph": "M", "ts": 6079338125113.712, "pid": 11, "tid": 0,
"args": {
"sort_index": 5000011
}
},
{
"name": "process_name", "ph": "M", "ts": 6079338125113.712, "pid": 12, "tid": 0,
"args": {
"name": "python"
}
},
{
"name": "process_labels", "ph": "M", "ts": 6079338125113.712, "pid": 12, "tid": 0,
"args": {
"labels": "GPU 12"
}
},
{
"name": "process_sort_index", "ph": "M", "ts": 6079338125113.712, "pid": 12, "tid": 0,
"args": {
"sort_index": 5000012
}
},
{
"name": "process_name", "ph": "M", "ts": 6079338125113.712, "pid": 13, "tid": 0,
"args": {
"name": "python"
}
},
{
"name": "process_labels", "ph": "M", "ts": 6079338125113.712, "pid": 13, "tid": 0,
"args": {
"labels": "GPU 13"
}
},
{
"name": "process_sort_index", "ph": "M", "ts": 6079338125113.712, "pid": 13, "tid": 0,
"args": {
"sort_index": 5000013
}
},
{
"name": "process_name", "ph": "M", "ts": 6079338125113.712, "pid": 14, "tid": 0,
"args": {
"name": "python"
}
},
{
"name": "process_labels", "ph": "M", "ts": 6079338125113.712, "pid": 14, "tid": 0,
"args": {
"labels": "GPU 14"
}
},
{
"name": "process_sort_index", "ph": "M", "ts": 6079338125113.712, "pid": 14, "tid": 0,
"args": {
"sort_index": 5000014
}
},
{
"name": "process_name", "ph": "M", "ts": 6079338125113.712, "pid": 15, "tid": 0,
"args": {
"name": "python"
}
},
{
"name": "process_labels", "ph": "M", "ts": 6079338125113.712, "pid": 15, "tid": 0,
"args": {
"labels": "GPU 15"
}
},
{
"name": "process_sort_index", "ph": "M", "ts": 6079338125113.712, "pid": 15, "tid": 0,
"args": {
"sort_index": 5000015
}
},
{
"name": "thread_name", "ph": "M", "ts": 6079338125113.712, "pid": 0, "tid": 7,
"args": {
"name": "stream 7 "
}
},
{
"name": "thread_sort_index", "ph": "M", "ts": 6079338125113.712, "pid": 0, "tid": 7,
"args": {
"sort_index": 7
}
},
{
"name": "thread_name", "ph": "M", "ts": 6079338125113.712, "pid": 3416838, "tid": 3416838,
"args": {
"name": "thread 3416838 (python)"
}
},
{
"name": "thread_sort_index", "ph": "M", "ts": 6079338125113.712, "pid": 3416838, "tid": 3416838,
"args": {
"sort_index": 3416838
}
},
{
"name": "thread_name", "ph": "M", "ts": 6079338125113.712, "pid": 3416838, "tid": 3420252,
"args": {
"name": "thread 3420252 (python)"
}
},
{
"name": "thread_sort_index", "ph": "M", "ts": 6079338125113.712, "pid": 3416838, "tid": 3420252,
"args": {
"sort_index": 3420252
}
},
{
"name": "thread_name", "ph": "M", "ts": 6079338125113.712, "pid": 3416838, "tid": 3420252,
"args": {
"name": "thread 3420252 (pt_autograd_0)"
}
},
{
"name": "thread_sort_index", "ph": "M", "ts": 6079338125113.712, "pid": 3416838, "tid": 3420252,
"args": {
"sort_index": 3420252
}
},
{
"ph": "X", "cat": "Trace", "ts": 6079338125061.844, "dur": 302707.341,
"pid": "Spans", "tid": "PyTorch Profiler",
"name": "PyTorch Profiler (0)",
"args": {
"Op count": 0
}
},
{
"name": "process_sort_index", "ph": "M", "ts": 6079338125061.844,
"pid": "Spans", "tid": 0,
"args": {
"sort_index": 536870912
}
},
{
"name": "Iteration Start: PyTorch Profiler", "ph": "i", "s": "g",
"pid": "Traces", "tid": "Trace PyTorch Profiler", "ts": 6079338125061.844
},
{
"name": "Record Window End", "ph": "i", "s": "g",
"pid": "", "tid": "", "ts": 6079338428120.755
}
],
"traceName": "/tmp/trace.json"
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment