@fishmingyu
Last active December 8, 2023 10:43
GNN breakdown profiling [PT2.0 compiler]
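The tables below are torch.profiler key_averages() summaries (sorted by self CPU time, top 20 rows) of PyG GNN runs compiled with torch_geometric.compile(backend="inductor") on CPU, over a synthetic graph with 10,000 nodes (64 features per node) and 200,000 random edges; the graph_0_cpp_fused_* rows are the C++ kernels generated by TorchInductor.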
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------
graph_0_cpp_fused_add_exp_index_select_mul_scatter_a... 29.02% 11.966ms 29.02% 11.966ms 11.966ms 1
graph_0_cpp_fused_add_clone_exp_index_select_mul_rel... 28.60% 11.794ms 28.60% 11.794ms 11.794ms 1
graph_0_cpp_fused_add_clone_exp_index_select_mul_new... 27.49% 11.335ms 27.49% 11.335ms 11.335ms 1
aten::scatter_ 5.92% 2.442ms 5.92% 2.442ms 814.000us 3
aten::scatter_reduce_ 4.85% 2.001ms 10.78% 4.443ms 1.481ms 3
aten::mm 1.38% 571.000us 1.38% 571.000us 190.333us 3
CompiledFunction 0.94% 389.000us 99.34% 40.957ms 40.957ms 1
graph_0_cpp_fused_add_index_select_leaky_relu_mul_ne... 0.46% 189.000us 0.46% 189.000us 189.000us 1
ProfilerStep* 0.42% 173.000us 100.00% 41.231ms 41.231ms 1
graph_0_cpp_fused_add_index_select_leaky_relu_mul_sc... 0.37% 152.000us 0.37% 152.000us 152.000us 1
graph_0_cpp_fused_add_index_select_leaky_relu_mul_sc... 0.25% 102.000us 0.25% 102.000us 102.000us 1
TorchDynamo Cache Lookup 0.23% 94.000us 0.23% 94.000us 94.000us 1
aten::empty_strided 0.03% 11.000us 0.03% 11.000us 0.458us 24
inductor::_reinterpret_tensor 0.01% 5.000us 0.01% 5.000us 0.294us 17
detach 0.01% 4.000us 0.01% 4.000us 2.000us 2
aten::detach 0.01% 3.000us 0.02% 7.000us 3.500us 2
aten::resolve_conj 0.00% 0.000us 0.00% 0.000us 0.000us 6
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------
graph_0_cpp_fused_add_clone_eq_index_index_select_ma... 31.86% 10.464ms 31.86% 10.464ms 10.464ms 1
graph_0_cpp_fused_add_clone_index_select_mul_relu_sc... 31.03% 10.190ms 31.03% 10.190ms 10.190ms 1
graph_0_cpp_fused_add_index_select_mul_scatter_add_3... 30.51% 10.021ms 30.51% 10.021ms 10.021ms 1
graph_0_cpp_fused_new_zeros_ones_scatter_add_0 3.85% 1.265ms 3.85% 1.265ms 1.265ms 1
aten::mm 1.33% 437.000us 1.33% 437.000us 145.667us 3
CompiledFunction 0.65% 214.000us 99.29% 32.608ms 32.608ms 1
ProfilerStep* 0.45% 149.000us 100.00% 32.842ms 32.842ms 1
TorchDynamo Cache Lookup 0.23% 77.000us 0.23% 77.000us 77.000us 1
aten::empty_strided 0.03% 11.000us 0.03% 11.000us 1.100us 10
detach 0.02% 6.000us 0.02% 6.000us 3.000us 2
inductor::_reinterpret_tensor 0.02% 6.000us 0.02% 6.000us 0.857us 7
aten::detach 0.01% 2.000us 0.02% 8.000us 4.000us 2
aten::resolve_conj 0.00% 0.000us 0.00% 0.000us 0.000us 6
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------
graph_0_cpp_fused_add_clone_index_select_mul_relu_sc... 33.41% 11.279ms 33.41% 11.279ms 11.279ms 1
graph_0_cpp_fused_add_index_select_mul_new_zeros_sca... 30.71% 10.366ms 30.71% 10.366ms 10.366ms 1
graph_0_cpp_fused_add_clone_index_select_mul_relu_sc... 30.47% 10.286ms 30.47% 10.286ms 10.286ms 1
aten::addmm 2.23% 753.000us 2.75% 929.000us 154.833us 6
CompiledFunction 0.89% 302.000us 99.14% 33.466ms 33.466ms 1
graph_0_cpp_fused_clone_relu_threshold_backward_1 0.53% 178.000us 0.53% 178.000us 178.000us 1
ProfilerStep* 0.52% 177.000us 100.00% 33.755ms 33.755ms 1
aten::copy_ 0.49% 165.000us 0.49% 165.000us 27.500us 6
TorchDynamo Cache Lookup 0.30% 101.000us 0.30% 101.000us 101.000us 1
graph_0_cpp_fused_clone_relu_threshold_backward_3 0.16% 54.000us 0.16% 54.000us 54.000us 1
graph_0_cpp_fused_clone_relu_threshold_backward_5 0.16% 54.000us 0.16% 54.000us 54.000us 1
aten::empty_strided 0.04% 12.000us 0.04% 12.000us 0.923us 13
aten::expand 0.03% 9.000us 0.03% 11.000us 1.833us 6
aten::detach 0.02% 6.000us 0.03% 11.000us 2.200us 5
inductor::_reinterpret_tensor 0.02% 6.000us 0.02% 6.000us 0.462us 13
detach 0.01% 5.000us 0.01% 5.000us 1.000us 5
aten::as_strided 0.01% 2.000us 0.01% 2.000us 0.333us 6
aten::resolve_conj 0.00% 0.000us 0.00% 0.000us 0.000us 12
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------
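In the first table, the three large Inductor-fused kernels alone account for 11.966 + 11.794 + 11.335 ≈ 35.1 ms of the 41.2 ms ProfilerStep, i.e. roughly 85% of the step goes to the fused gather/scatter message-passing kernels, while the dense aten::mm calls take well under 1 ms. In the profiling script below, setting torch._inductor.config.cpp.enable_kernel_profile = True is what lets these generated C++ kernels appear under their graph_0_cpp_fused_* names.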
import argparse
import os

import torch
import torch_geometric
from torch._inductor import config
from torch.profiler import ProfilerActivity, profile, schedule
from torch_geometric.nn.models import GAT, GCN, GIN, PNA, EdgeCNN, GraphSAGE  # only GCN is used below

# Emit profiler annotations in the Inductor-generated C++ kernels so they
# show up by name (graph_0_cpp_fused_*) in the tables above.
config.cpp.enable_kernel_profile = True

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--device", type=str, default="cpu")
    parser.add_argument("--backward", action="store_true")  # accepted but unused in this forward-only profile
    args = parser.parse_args()

    # Synthetic input: 10k nodes with 64 features, 200k random edges.
    num_nodes, num_edges = 10_000, 200_000
    x = torch.randn(num_nodes, 64, device=args.device)
    edge_index = torch.randint(num_nodes, (2, num_edges), device=args.device)

    kwargs = {"add_self_loops": False}
    model = GCN(64, 64, num_layers=3, **kwargs).to(args.device)

    # Compile with TorchInductor and run once so compilation happens
    # outside the profiled region.
    compiled_model = torch_geometric.compile(model, backend="inductor")
    compiled_model(x, edge_index)

    RESULT_DIR = "./prof_trace"
    os.makedirs(RESULT_DIR, exist_ok=True)

    # With 50 steps, skip_first=10 and an 11-step cycle (wait + warmup + active),
    # three complete cycles are profiled, producing the three tables above.
    my_schedule = schedule(
        skip_first=10,
        wait=5,
        warmup=5,
        active=1,
        repeat=5)

    def trace_handler(p):
        # Print a key_averages() summary and export a Chrome trace per cycle.
        output = p.key_averages().table(sort_by="self_cpu_time_total",
                                        row_limit=20)
        print(output)
        p.export_chrome_trace(f"{RESULT_DIR}/{p.step_num}.json")

    with profile(
            activities=[ProfilerActivity.CPU],
            schedule=my_schedule,
            on_trace_ready=trace_handler) as p:
        for _ in range(50):
            compiled_model(x, edge_index)
            p.step()
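Each profiled cycle also leaves a Chrome-trace JSON in ./prof_trace/. As a small post-processing sketch (assuming the standard Chrome trace layout that export_chrome_trace writes, with per-event "name" and "dur" fields in microseconds; the helper fused_kernel_totals is made up for illustration), the time spent in the Inductor-fused kernels can be totalled per trace like this:

import glob
import json
from collections import defaultdict

def fused_kernel_totals(path):
    # Hypothetical helper: total the wall-clock time (in ms) of each
    # Inductor-generated C++ kernel recorded in one exported Chrome trace.
    with open(path) as f:
        data = json.load(f)
    # Recent PyTorch writes a dict with a "traceEvents" list; fall back to a
    # bare list just in case.
    events = data["traceEvents"] if isinstance(data, dict) else data
    totals = defaultdict(float)
    for ev in events:
        name = ev.get("name", "")
        if "_cpp_fused_" in name:
            totals[name] += ev.get("dur", 0) / 1000.0  # "dur" is in microseconds
    return dict(totals)

for path in sorted(glob.glob("./prof_trace/*.json")):
    print(path)
    for name, ms in sorted(fused_kernel_totals(path).items(), key=lambda kv: -kv[1]):
        print(f"  {ms:8.3f} ms  {name}")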