The running script is:
```
import torch
import torch.nn as nn
import torch._dynamo as torchdynamo
import copy

class Mod(torch.nn.Module):
    def __init__(self) -> None:
        super().__init__()
        # (the rest of the script is truncated in the gist)
```
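A minimal reconstruction of the truncated script that would produce guards like the ones below. The single nn.Linear layer is an assumption suggested by the self.linear guard; the layer and input sizes are placeholders:
```
import torch
import torch._dynamo as torchdynamo

class Mod(torch.nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.linear = torch.nn.Linear(16, 16)  # layer size is an assumption

    def forward(self, x):
        return self.linear(x)

m = Mod()
# The "eager" backend runs dynamo's tracing and guard machinery without Inductor codegen.
compiled = torchdynamo.optimize("eager")(m)
compiled(torch.randn(2, 16))  # the first call traces the graph and installs guards
```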
Running it prints the guards that dynamo installed (the output is truncated in the gist):
```
guards is: {
Guard(name='self', source=<GuardSource.LOCAL: 0>, create_fn=<function GuardBuilder.NN_MODULE at 0x7f01161ab160>, is_volatile=False, guard_types=['ID_MATCH'], code_list=['___check_obj_id(self, 139643301645088)'], obj_weakref=<weakref at 0x7f00fa06a180; to 'Mod' at 0x7f013d63cb20>, guarded_class_weakref=<weakref at 0x7f01362b6360; to 'type' at 0x5572acee40e0 (Mod)>),
Guard(name='x', source=<GuardSource.LOCAL: 0>, create_fn=<function GuardBuilder.TENSOR_MATCH at 0x7f01161ab8b0>, is_volatile=False, guard_types=['TENSOR_MATCH'], code_list=None, obj_weakref=<weakref at 0x7f0116fac2c0; to 'Tensor' at 0x7f0116ed6810>, guarded_class_weakref=<weakref at 0x7f0119143900; to 'torch._C._TensorMeta' at 0x5572ab714830 (Tensor)>),
Guard(name='self.linear', source=<GuardSource.LOCAL_NN_MODULE: 2>, create_fn=<function GuardBuilder.NN_MODULE at 0x7f01161ab160>, is_volatile=False, guard_types=None, code_list=None, obj_weakref=None, guarded_class_weakref=None),
Guard(name='torch', source=<GuardSource.GLOBAL: 1>, cr
```
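These guards pin the compiled code to its tracing assumptions: the NN_MODULE guard on self is an ID_MATCH that checks object identity via ___check_obj_id, and the TENSOR_MATCH guard on x checks tensor properties such as dtype, device, shape, and strides. When any guard fails, dynamo retraces. Continuing the reconstruction above (shapes are still placeholders):
```
compiled(torch.randn(2, 16))  # same module instance, same tensor properties: guards pass, code is reused
compiled(torch.randn(8, 16))  # with static shapes (the default here), a new batch size fails TENSOR_MATCH and triggers a recompile
```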
```
-----------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------
Name                                             Self CPU %    Self CPU      CPU total %   CPU total     CPU time avg  # of Calls
-----------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------
quantized::conv_relu_int8_packed_weight          13.77%        69.356ms      20.19%        101.737ms     3.083ms       33
quantized::conv_int8_packed_weight               13.59%        68.488ms      19.10%        96.245ms      4.812ms       20
graph_1_kernel_cpp_1                             9.96%         50.175ms      9.96%         50.175ms      50.175ms      1
graph_1_kernel_cpp_2                             9.32%         46.958ms      9.32%         46.958ms      46.958ms      1
graph_1_kernel_cpp_3                             6.54%         32.936ms      6.54%         (truncated in the gist)
```
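Tables like this come from torch.profiler; the graph_1_kernel_cpp_* rows appear because Inductor's generated kernels register themselves via RECORD_FUNCTION, as the C++ below shows. A minimal sketch of producing such a table (the model here is a placeholder, not the quantized RN50):
```
import torch
from torch.profiler import profile, ProfilerActivity

model = torch.nn.Linear(16, 16).eval()  # placeholder model
x = torch.randn(2, 16)

with profile(activities=[ProfilerActivity.CPU]) as prof:
    with torch.no_grad():
        model(x)

# Sort by self CPU time to match the columns above.
print(prof.key_averages().table(sort_by="self_cpu_time_total", row_limit=10))
```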
The first generated C++ kernel (the loop body is truncated in the gist):
```
kernel_cpp_0 = async_compile.cpp('''
#include <ATen/record_function.h>
#include "/tmp/torchinductor_root/dm/cdmaihqxwe73zkb3he2zizktpq5uujetg2db26c3r4lgsmlx3b4c.h"
extern "C" void kernel(const float* __restrict__ in_ptr0,
                       const float* __restrict__ in_ptr1,
                       const long* __restrict__ in_ptr2,
                       unsigned char* __restrict__ out_ptr0)
{
    RECORD_FUNCTION("graph_1_kernel_cpp_0", c10::ArrayRef<c10::IValue>({}));
    #pragma omp parallel num_threads(28)
    // (the kernel body is truncated in the gist)
```
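The kernel body is cut off, but the signature (two float inputs, a long input, a uint8 output) is consistent with per-tensor affine quantization. A sketch of the equivalent computation in Python; mapping in_ptr1 and in_ptr2 to scale and zero point is an assumption based only on the pointer types:
```
import torch

def quantize_like_kernel_cpp_0(x: torch.Tensor, scale: torch.Tensor, zero_point: torch.Tensor) -> torch.Tensor:
    # Assumed semantics: clamp(round(x / scale) + zero_point, 0, 255) -> uint8,
    # i.e. standard per-tensor affine quantization to an unsigned 8-bit range.
    q = torch.round(x / scale) + zero_point
    return torch.clamp(q, 0, 255).to(torch.uint8)
```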
A separate snippet defines a ReLU-only module; the forward body is cut off, so the return is an assumed completion:
```
import torch
import torch._dynamo as torchdynamo
import copy

class Mod(torch.nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.relu = torch.nn.ReLU()

    def forward(self, x):
        return self.relu(x)  # assumed completion; the gist truncates here
```
Another fragment shows the prologue of Inductor's generated wrapper code:
```
from ctypes import c_void_p, c_long
import torch
import random
from torch import empty_strided, as_strided, device
from torch._inductor.codecache import AsyncCompile
from torch._inductor.select_algorithm import extern_kernels

aten = torch.ops.aten
assert_size_stride = torch._C._dynamo.guards.assert_size_stride
async_compile = AsyncCompile()
```
The quantization test script imports the PT2E APIs; the test body is truncated in the gist:
```
import torch
import torch._dynamo as torchdynamo
from torch.ao.quantization import (
    get_default_qconfig,
    QConfigMapping,
)
from torch.ao.quantization._quantize_pt2e import prepare_pt2e, convert_pt2e
from torch._inductor.compile_fx import compile_fx

def test_single_conv():
    # (the test body is truncated in the gist)
```
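A minimal sketch of how test_single_conv() plausibly continues, following the PT2E flow these imports suggest. The prepare_pt2e signature changed across early PyTorch 2.0 nightlies, so the exact arguments, the "x86" backend string, and the layer sizes are assumptions:
```
def test_single_conv():
    class Mod(torch.nn.Module):
        def __init__(self) -> None:
            super().__init__()
            self.conv = torch.nn.Conv2d(3, 16, kernel_size=3)  # sizes are assumptions

        def forward(self, x):
            return self.conv(x)

    m = Mod().eval()
    example_inputs = (torch.randn(1, 3, 16, 16),)

    # Export to an FX graph with dynamo, then run the prepare/convert flow.
    exported, _ = torchdynamo.export(m, *example_inputs)
    qconfig_mapping = QConfigMapping().set_global(get_default_qconfig("x86"))
    prepared = prepare_pt2e(exported, qconfig_mapping, example_inputs)  # some nightlies also took a backend_config
    prepared(*example_inputs)  # calibration pass
    converted = convert_pt2e(prepared)

    # Lower the quantized graph through Inductor.
    optimized = compile_fx(converted, list(example_inputs))
    optimized(*example_inputs)
```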
leslie-fang-intel / gist:f8b9df5aefdf72f2111d5237fb178ff0 (created March 2, 2023): RN50 Int8 Inductor Generated Code
The generated code opens with the usual Inductor prologue:
```
from ctypes import c_void_p, c_long
import torch
import math
import random
from torch import empty_strided, as_strided, device
from torch._inductor.codecache import AsyncCompile
from torch._inductor.select_algorithm import extern_kernels

aten = torch.ops.aten
assert_size_stride = torch._C._dynamo.guards.assert_size_stride
```
The driving script enables dynamic shapes and debug logging:
```
# Running CMD is: clear && TORCHDYNAMO_DYNAMIC_SHAPES=1 python test_rn50.py 2>&1 | tee test.log
import torch
import torch._dynamo as torchdynamo
import torchvision.models as models
import copy
from torch._inductor.compile_fx import compile_fx
import logging

torch._dynamo.config.log_level = logging.DEBUG
```
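A hedged sketch of how the script plausibly continues; routing through torchdynamo.optimize("inductor") (rather than calling compile_fx directly on an exported graph) and the input shape are assumptions:
```
model = models.resnet50().eval()
x = torch.randn(1, 3, 224, 224)  # batch size is an assumption

with torch.no_grad():
    compiled = torchdynamo.optimize("inductor")(model)
    compiled(x)  # the first call triggers compilation; generated C++ lands under /tmp/torchinductor_root
```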