Skip to content

Instantly share code, notes, and snippets.

# Launcher for the Adagrad benchmark: sets up the preloaded runtime libraries,
# allocator/OpenMP tuning and benchmark sizing, then runs adagrad.py pinned
# with numactl.

# Root of the conda installation: the active environment when CONDA_PREFIX is
# set, otherwise the parent of the directory that holds the `conda` executable.
# Resolved once (instead of once per library) using `command -v` rather than
# the non-portable `which`, with every substitution quoted.
conda_root=${CONDA_PREFIX:-"$(dirname -- "$(command -v conda)")/.."}

# Preload the Intel OpenMP runtime and jemalloc from that conda installation.
export LD_PRELOAD="${conda_root}/lib/libiomp5.so:${conda_root}/lib/libjemalloc.so"

# Option string handed to the preloaded jemalloc through MALLOC_CONF.
export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:-1,muzzy_decay_ms:-1"

# OpenMP runtime (KMP_*) thread placement and idle-wait settings.
export KMP_AFFINITY=granularity=fine,compact,1,0
export KMP_BLOCKTIME=1

# Benchmark sizing; adagrad.py reads TENSOR_SIZE and NPARAM from the
# environment.
export TENSOR_SIZE=262144
export NPARAM=4
export OMP_NUM_THREADS=1

echo "Tensor Size: $TENSOR_SIZE, Num Tensor $NPARAM, Num Threads: $OMP_NUM_THREADS"

# -C 1: run only on CPU 1; -m 0: allocate memory only from NUMA node 0.
numactl -C 1 -m 0 python adagrad.py
import torch
from torch.optim.adagrad import _single_tensor_adagrad, _fused_adagrad
import copy
import os

# Device and dtype selections for this script.
device = 'cpu'
dtype = torch.float

# Sizing knobs, overridable through the environment:
#   TENSOR_SIZE -> one-element tuple, defaults to 512 * 512
#   NPARAM      -> integer, defaults to 4
TENSOR_SIZE = (int(os.environ.get('TENSOR_SIZE', 512 * 512)),)
NPARAM = int(os.environ.get("NPARAM", 4))
@zhuhaozhe
zhuhaozhe / bench.sh
Last active April 23, 2024 03:14
launcher for sgd bench
# Launcher for the SGD benchmark: sets up the preloaded runtime libraries,
# allocator/OpenMP tuning and benchmark sizing, then runs sgd.py pinned with
# numactl.

# Root of the conda installation: the active environment when CONDA_PREFIX is
# set, otherwise the parent of the directory that holds the `conda` executable.
# Resolved once (instead of once per library) using `command -v` rather than
# the non-portable `which`, with every substitution quoted.
conda_root=${CONDA_PREFIX:-"$(dirname -- "$(command -v conda)")/.."}

# Preload the Intel OpenMP runtime and jemalloc from that conda installation.
export LD_PRELOAD="${conda_root}/lib/libiomp5.so:${conda_root}/lib/libjemalloc.so"

# Option string handed to the preloaded jemalloc through MALLOC_CONF.
export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:-1,muzzy_decay_ms:-1"

# OpenMP runtime (KMP_*) thread placement and idle-wait settings.
export KMP_AFFINITY=granularity=fine,compact,1,0
export KMP_BLOCKTIME=1

# Benchmark sizing; sgd.py reads TENSOR_SIZE and NPARAM from the environment.
export TENSOR_SIZE=262144
export NPARAM=4
export OMP_NUM_THREADS=1

echo "Tensor Size: $TENSOR_SIZE, Num Tensor $NPARAM, Num Threads: $OMP_NUM_THREADS"

# -C 1: run only on CPU 1; -m 0: allocate memory only from NUMA node 0.
numactl -C 1 -m 0 python sgd.py
@zhuhaozhe
zhuhaozhe / sgd.py
Last active April 23, 2024 03:14
sgd benchmark
import torch
from torch.optim.sgd import _single_tensor_sgd, _fused_sgd
import copy
import os

# Device and dtype selections for this script.
device = 'cpu'
dtype = torch.float

# Sizing knobs, overridable through the environment:
#   TENSOR_SIZE -> one-element tuple, defaults to 512 * 512
#   NPARAM      -> integer, defaults to 4
TENSOR_SIZE = (int(os.environ.get('TENSOR_SIZE', 512 * 512)),)
NPARAM = int(os.environ.get("NPARAM", 4))
@zhuhaozhe
zhuhaozhe / scalar.py
Last active April 17, 2024 06:09
scalar.py
from ctypes import c_void_p, c_long
import torch
import math
import random
import os
import tempfile
from math import inf, nan
from torch._inductor.hooks import run_intermediate_hooks
from torch._inductor.utils import maybe_profile
from ctypes import c_void_p, c_long
import torch
import math
import random
import os
import tempfile
from math import inf, nan
from torch._inductor.hooks import run_intermediate_hooks
from torch._inductor.utils import maybe_profile
@zhuhaozhe
zhuhaozhe / benchmark-fused-adam.py
Created April 1, 2024 07:50
benchmark-fused-adam
from typing import List, Optional, Union
import torch
from torch import Tensor
from torch.optim.optimizer import _get_value, _dispatch_sqrt
# Fixed sizing constants for this script (not read from the environment here).
TENSOR_SIZE = 1024 ** 2
NPARAM = 10
cpp_fused_mul_sum_0 = async_compile.cpp_pybinding(['const float*', 'const float*', 'float*'], '''
#include "/tmp/torchinductor_root/lg/clghje745biezhrbrw5fghxqjaj76ck5jms7466s4ax63eruswf5.h"
extern "C" void kernel(const float* in_ptr0,
const float* in_ptr1,
float* out_ptr0)
{
{
{
float tmp_acc0 = 0;
at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(0);