@fishmingyu
Last active December 8, 2023 10:43
GNN breakdown profiling [PT2.0 compiler]
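The tables below are torch.profiler key_averages() summaries (sorted by self CPU time, top 20 rows) of PyG GNN runs compiled with torch_geometric.compile(backend="inductor") on CPU, over a synthetic graph with 10,000 nodes (64 features per node) and 200,000 random edges; the graph_0_cpp_fused_* rows are the C++ kernels generated by TorchInductor.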
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------
graph_0_cpp_fused_add_exp_index_select_mul_scatter_a... 29.02% 11.966ms 29.02% 11.966ms 11.966ms 1
graph_0_cpp_fused_add_clone_exp_index_select_mul_rel... 28.60% 11.794ms 28.60% 11.794ms 11.794ms 1
graph_0_cpp_fused_add_clone_exp_index_select_mul_new... 27.49% 11.335ms 27.49% 11.335ms 11.335ms 1
aten::scatter_ 5.92% 2.442ms 5.92% 2.442ms 814.000us 3
aten::scatter_reduce_ 4.85% 2.001ms 10.78% 4.443ms 1.481ms 3
aten::mm 1.38% 571.000us 1.38% 571.000us 190.333us 3
CompiledFunction 0.94% 389.000us 99.34% 40.957ms 40.957ms 1
graph_0_cpp_fused_add_index_select_leaky_relu_mul_ne... 0.46% 189.000us 0.46% 189.000us 189.000us 1
ProfilerStep* 0.42% 173.000us 100.00% 41.231ms 41.231ms 1
graph_0_cpp_fused_add_index_select_leaky_relu_mul_sc... 0.37% 152.000us 0.37% 152.000us 152.000us 1
graph_0_cpp_fused_add_index_select_leaky_relu_mul_sc... 0.25% 102.000us 0.25% 102.000us 102.000us 1
TorchDynamo Cache Lookup 0.23% 94.000us 0.23% 94.000us 94.000us 1
aten::empty_strided 0.03% 11.000us 0.03% 11.000us 0.458us 24
inductor::_reinterpret_tensor 0.01% 5.000us 0.01% 5.000us 0.294us 17
detach 0.01% 4.000us 0.01% 4.000us 2.000us 2
aten::detach 0.01% 3.000us 0.02% 7.000us 3.500us 2
aten::resolve_conj 0.00% 0.000us 0.00% 0.000us 0.000us 6
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------
graph_0_cpp_fused_add_clone_eq_index_index_select_ma... 31.86% 10.464ms 31.86% 10.464ms 10.464ms 1
graph_0_cpp_fused_add_clone_index_select_mul_relu_sc... 31.03% 10.190ms 31.03% 10.190ms 10.190ms 1
graph_0_cpp_fused_add_index_select_mul_scatter_add_3... 30.51% 10.021ms 30.51% 10.021ms 10.021ms 1
graph_0_cpp_fused_new_zeros_ones_scatter_add_0 3.85% 1.265ms 3.85% 1.265ms 1.265ms 1
aten::mm 1.33% 437.000us 1.33% 437.000us 145.667us 3
CompiledFunction 0.65% 214.000us 99.29% 32.608ms 32.608ms 1
ProfilerStep* 0.45% 149.000us 100.00% 32.842ms 32.842ms 1
TorchDynamo Cache Lookup 0.23% 77.000us 0.23% 77.000us 77.000us 1
aten::empty_strided 0.03% 11.000us 0.03% 11.000us 1.100us 10
detach 0.02% 6.000us 0.02% 6.000us 3.000us 2
inductor::_reinterpret_tensor 0.02% 6.000us 0.02% 6.000us 0.857us 7
aten::detach 0.01% 2.000us 0.02% 8.000us 4.000us 2
aten::resolve_conj 0.00% 0.000us 0.00% 0.000us 0.000us 6
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------
graph_0_cpp_fused_add_clone_index_select_mul_relu_sc... 33.41% 11.279ms 33.41% 11.279ms 11.279ms 1
graph_0_cpp_fused_add_index_select_mul_new_zeros_sca... 30.71% 10.366ms 30.71% 10.366ms 10.366ms 1
graph_0_cpp_fused_add_clone_index_select_mul_relu_sc... 30.47% 10.286ms 30.47% 10.286ms 10.286ms 1
aten::addmm 2.23% 753.000us 2.75% 929.000us 154.833us 6
CompiledFunction 0.89% 302.000us 99.14% 33.466ms 33.466ms 1
graph_0_cpp_fused_clone_relu_threshold_backward_1 0.53% 178.000us 0.53% 178.000us 178.000us 1
ProfilerStep* 0.52% 177.000us 100.00% 33.755ms 33.755ms 1
aten::copy_ 0.49% 165.000us 0.49% 165.000us 27.500us 6
TorchDynamo Cache Lookup 0.30% 101.000us 0.30% 101.000us 101.000us 1
graph_0_cpp_fused_clone_relu_threshold_backward_3 0.16% 54.000us 0.16% 54.000us 54.000us 1
graph_0_cpp_fused_clone_relu_threshold_backward_5 0.16% 54.000us 0.16% 54.000us 54.000us 1
aten::empty_strided 0.04% 12.000us 0.04% 12.000us 0.923us 13
aten::expand 0.03% 9.000us 0.03% 11.000us 1.833us 6
aten::detach 0.02% 6.000us 0.03% 11.000us 2.200us 5
inductor::_reinterpret_tensor 0.02% 6.000us 0.02% 6.000us 0.462us 13
detach 0.01% 5.000us 0.01% 5.000us 1.000us 5
aten::as_strided 0.01% 2.000us 0.01% 2.000us 0.333us 6
aten::resolve_conj 0.00% 0.000us 0.00% 0.000us 0.000us 12
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------
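In the first table, the three large Inductor-fused kernels alone account for 11.966 + 11.794 + 11.335 ≈ 35.1 ms of the 41.2 ms ProfilerStep, i.e. roughly 85% of the step goes to the fused gather/scatter message-passing kernels, while the dense aten::mm calls take well under 1 ms. In the profiling script below, setting torch._inductor.config.cpp.enable_kernel_profile = True is what lets these generated C++ kernels appear under their graph_0_cpp_fused_* names.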
import argparse
import os

import torch
import torch_geometric
from torch._inductor import config
from torch.profiler import ProfilerActivity, profile, schedule
from torch_geometric.nn.models import GAT, GCN, GIN, PNA, EdgeCNN, GraphSAGE  # only GCN is used below

# Emit profiler annotations in the Inductor-generated C++ kernels so they
# show up by name (graph_0_cpp_fused_*) in the tables above.
config.cpp.enable_kernel_profile = True

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--device", type=str, default="cpu")
    parser.add_argument("--backward", action="store_true")  # accepted but unused in this forward-only profile
    args = parser.parse_args()

    # Synthetic input: 10k nodes with 64 features, 200k random edges.
    num_nodes, num_edges = 10_000, 200_000
    x = torch.randn(num_nodes, 64, device=args.device)
    edge_index = torch.randint(num_nodes, (2, num_edges), device=args.device)

    kwargs = {"add_self_loops": False}
    model = GCN(64, 64, num_layers=3, **kwargs).to(args.device)

    # Compile with TorchInductor and run once so compilation happens
    # outside the profiled region.
    compiled_model = torch_geometric.compile(model, backend="inductor")
    compiled_model(x, edge_index)

    RESULT_DIR = "./prof_trace"
    os.makedirs(RESULT_DIR, exist_ok=True)

    # With 50 steps, skip_first=10 and an 11-step cycle (wait + warmup + active),
    # three complete cycles are profiled, producing the three tables above.
    my_schedule = schedule(
        skip_first=10,
        wait=5,
        warmup=5,
        active=1,
        repeat=5)

    def trace_handler(p):
        # Print a key_averages() summary and export a Chrome trace per cycle.
        output = p.key_averages().table(sort_by="self_cpu_time_total",
                                        row_limit=20)
        print(output)
        p.export_chrome_trace(f"{RESULT_DIR}/{p.step_num}.json")

    with profile(
            activities=[ProfilerActivity.CPU],
            schedule=my_schedule,
            on_trace_ready=trace_handler) as p:
        for _ in range(50):
            compiled_model(x, edge_index)
            p.step()
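Each profiled cycle also leaves a Chrome-trace JSON in ./prof_trace/. As a small post-processing sketch (assuming the standard Chrome trace layout that export_chrome_trace writes, with per-event "name" and "dur" fields in microseconds; the helper fused_kernel_totals is made up for illustration), the time spent in the Inductor-fused kernels can be totalled per trace like this:

import glob
import json
from collections import defaultdict

def fused_kernel_totals(path):
    # Hypothetical helper: total the wall-clock time (in ms) of each
    # Inductor-generated C++ kernel recorded in one exported Chrome trace.
    with open(path) as f:
        data = json.load(f)
    # Recent PyTorch writes a dict with a "traceEvents" list; fall back to a
    # bare list just in case.
    events = data["traceEvents"] if isinstance(data, dict) else data
    totals = defaultdict(float)
    for ev in events:
        name = ev.get("name", "")
        if "_cpp_fused_" in name:
            totals[name] += ev.get("dur", 0) / 1000.0  # "dur" is in microseconds
    return dict(totals)

for path in sorted(glob.glob("./prof_trace/*.json")):
    print(path)
    for name, ms in sorted(fused_kernel_totals(path).items(), key=lambda kv: -kv[1]):
        print(f"  {ms:8.3f} ms  {name}")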