zhuhaozhe

## bench-adagrad.sh
# Launcher for the Adagrad benchmark (adagrad.py).
#
# Resolve the conda root once and reuse it for both preload paths.
# CONDA_PREFIX is used when it is set and non-empty; otherwise the root is
# taken as the parent of the directory that contains the `conda` executable.
# `command -v` is a shell builtin (no dependency on the external `which`), and
# the nested substitution is quoted so a path containing whitespace is not
# split into several arguments to dirname.
conda_root=${CONDA_PREFIX:-"$(dirname "$(command -v conda)")/../"}

# Preload libiomp5 and libjemalloc from the conda lib directory.
export LD_PRELOAD="${conda_root}/lib/libiomp5.so:${conda_root}/lib/libjemalloc.so"
# Allocator options passed through MALLOC_CONF (read by the preloaded jemalloc).
export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:-1,muzzy_decay_ms:-1"
# KMP_* settings for the preloaded OpenMP runtime.
export KMP_AFFINITY=granularity=fine,compact,1,0
export KMP_BLOCKTIME=1

# Benchmark shape and thread count, exported for the Python process.
export TENSOR_SIZE=262144
export NPARAM=4
export OMP_NUM_THREADS=1
printf 'Tensor Size: %s, Num Tensor %s, Num Threads: %s\n' \
  "$TENSOR_SIZE" "$NPARAM" "$OMP_NUM_THREADS"

# Bind the run to CPU 1 (-C) and to memory on NUMA node 0 (-m).
numactl -C 1 -m 0 python adagrad.py

## adagrad.py
import torch
from torch.optim.adagrad import _single_tensor_adagrad, _fused_adagrad
import copy
import os

# Device and dtype selected for the benchmark (their use lies beyond the
# visible part of this file).
device = 'cpu'
dtype = torch.float32

# Both sizes can be overridden through environment variables of the same name.
# TENSOR_SIZE is kept as a one-element tuple; the default is 512 * 512.
TENSOR_SIZE = (int(os.environ.get('TENSOR_SIZE', 512 * 512)),)
# NPARAM defaults to 4.
NPARAM = int(os.environ.get("NPARAM", 4))

## bench.sh
# Launcher for the SGD benchmark (sgd.py).
#
# Resolve the conda root once and reuse it for both preload paths.
# CONDA_PREFIX is used when it is set and non-empty; otherwise the root is
# taken as the parent of the directory that contains the `conda` executable.
# `command -v` is a shell builtin (no dependency on the external `which`), and
# the nested substitution is quoted so a path containing whitespace is not
# split into several arguments to dirname.
conda_root=${CONDA_PREFIX:-"$(dirname "$(command -v conda)")/../"}

# Preload libiomp5 and libjemalloc from the conda lib directory.
export LD_PRELOAD="${conda_root}/lib/libiomp5.so:${conda_root}/lib/libjemalloc.so"
# Allocator options passed through MALLOC_CONF (read by the preloaded jemalloc).
export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:-1,muzzy_decay_ms:-1"
# KMP_* settings for the preloaded OpenMP runtime.
export KMP_AFFINITY=granularity=fine,compact,1,0
export KMP_BLOCKTIME=1

# Benchmark shape and thread count, exported for the Python process.
export TENSOR_SIZE=262144
export NPARAM=4
export OMP_NUM_THREADS=1
printf 'Tensor Size: %s, Num Tensor %s, Num Threads: %s\n' \
  "$TENSOR_SIZE" "$NPARAM" "$OMP_NUM_THREADS"

# Bind the run to CPU 1 (-C) and to memory on NUMA node 0 (-m).
numactl -C 1 -m 0 python sgd.py

## sgd.py
import torch
from torch.optim.sgd import _single_tensor_sgd, _fused_sgd
import copy
import os

# Device and dtype selected for the benchmark (their use lies beyond the
# visible part of this file).
device = 'cpu'
dtype = torch.float32

# Both sizes can be overridden through environment variables of the same name.
# TENSOR_SIZE is kept as a one-element tuple; the default is 512 * 512.
TENSOR_SIZE = (int(os.environ.get('TENSOR_SIZE', 512 * 512)),)
# NPARAM defaults to 4.
NPARAM = int(os.environ.get("NPARAM", 4))

## scalar.py

from ctypes import c_void_p, c_long
import torch
import math
import random
import os
import tempfile
from math import inf, nan
from torch._inductor.hooks import run_intermediate_hooks
from torch._inductor.utils import maybe_profile

## vec.py

from ctypes import c_void_p, c_long
import torch
import math
import random
import os
import tempfile
from math import inf, nan
from torch._inductor.hooks import run_intermediate_hooks
from torch._inductor.utils import maybe_profile

## benchmark-fused-adam.py
from typing import List, Optional, Union

import torch
from torch import Tensor
from torch.optim.optimizer import _get_value, _dispatch_sqrt

# Fixed benchmark dimensions. Presumably the number of parameter tensors and
# the element count of each one; their use is not visible here, so confirm.
NPARAM: int = 10
TENSOR_SIZE: int = 1024 ** 2


## gist:83481575093357ca603cfdd66d3fd37f
cpp_fused_mul_sum_0 = async_compile.cpp_pybinding(['const float*', 'const float*', 'float*'], '''
#include "/tmp/torchinductor_root/lg/clghje745biezhrbrw5fghxqjaj76ck5jms7466s4ax63eruswf5.h"
extern "C" void kernel(const float* in_ptr0,
                       const float* in_ptr1,
                       float* out_ptr0)
{
    {
        {
            float tmp_acc0 = 0;
            at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(0);
	export LD_PRELOAD=${CONDA_PREFIX:-"$(dirname $(which conda))/../"}/lib/libiomp5.so:${CONDA_PREFIX:-"$(dirname $(which conda))/../"}/lib/libjemalloc.so
	export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:-1,muzzy_decay_ms:-1"
	export KMP_AFFINITY=granularity=fine,compact,1,0
	export KMP_BLOCKTIME=1

	export TENSOR_SIZE=262144
	export NPARAM=4
	export OMP_NUM_THREADS=1
	echo "Tensor Size: $TENSOR_SIZE, Num Tensor $NPARAM, Num Threads: $OMP_NUM_THREADS"
	numactl -C 1 -m 0 python adagrad.py
	import torch
	from torch.optim.adagrad import _single_tensor_adagrad, _fused_adagrad
	import copy
	device='cpu'
	dtype=torch.float
	import os

	TENSOR_SIZE = (int(os.getenv('TENSOR_SIZE', 512 * 512)), )
	NPARAM = int(os.getenv("NPARAM", 4))
	import torch
	from torch.optim.sgd import _single_tensor_sgd, _fused_sgd
	import copy
	device='cpu'
	dtype=torch.float
	import os

	TENSOR_SIZE = (int(os.getenv('TENSOR_SIZE', 512 * 512)), )
	NPARAM = int(os.getenv("NPARAM", 4))

	from ctypes import c_void_p, c_long
	import torch
	import math
	import random
	import os
	import tempfile
	from math import inf, nan
	from torch._inductor.hooks import run_intermediate_hooks
	from torch._inductor.utils import maybe_profile
	from typing import List, Optional, Union

	import torch
	from torch import Tensor
	from torch.optim.optimizer import _get_value, _dispatch_sqrt

	NPARAM = 10
	TENSOR_SIZE = 1024 * 1024
	cpp_fused_mul_sum_0 = async_compile.cpp_pybinding(['const float*', 'const float*', 'float*'], '''
	#include "/tmp/torchinductor_root/lg/clghje745biezhrbrw5fghxqjaj76ck5jms7466s4ax63eruswf5.h"
	extern "C" void kernel(const float* in_ptr0,
	const float* in_ptr1,
	float* out_ptr0)
	{
	{
	{
	float tmp_acc0 = 0;
	at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(0);