Stas Bekman stas00
@stas00
stas00 / static_kv_cache.py
Created March 2, 2024 02:56 — forked from ArthurZucker/static_kv_cache.py
simple static kv cache script
from transformers import AutoModelForCausalLM, AutoTokenizer, StaticCache
import torch
from typing import Optional

device = "cuda"

# Copied from the gpt-fast repo
def multinomial_sample_one_no_sync(probs_sort):  # Does multinomial sampling without a cuda synchronization
    q = torch.empty_like(probs_sort).exponential_(1)
    return torch.argmax(probs_sort / q, dim=-1, keepdim=True).to(dtype=torch.int)
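
A hedged usage sketch, not part of the gist preview: sampling one next token from model logits with the helper above. The vocab size and shapes are assumptions.

# Hedged usage sketch: sample one token without forcing a CUDA sync.
logits = torch.randn(1, 32000, device=device)  # vocab size is an assumption
probs = torch.softmax(logits, dim=-1)
next_token = multinomial_sample_one_no_sync(probs)  # shape (1, 1), dtype int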
@stas00
stas00 / Mellanox OFED cheat sheet
Created March 1, 2024 02:40 — forked from githubfoam/Mellanox OFED cheat sheet
Mellanox OFED cheat sheet
--------------------------------------------------------------------------
# ofed_info -s
--------------------------------------------------------------------------
Find Mellanox Adapter Type and Firmware/Driver version
ConnectX-4 card
# lspci | grep Mellanox
0a:00.0 Network controller: Mellanox Technologies MT27500 Family [ConnectX-3]
# lspci -vv -s 0a:00.0 | grep "Part number" -A 3
# lspci | grep Mellanox | awk '{print $1}' | xargs -i -r mstvpd {}
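
For scripted collection, a hedged Python sketch that shells out to the same commands listed above; it is not part of the cheat sheet and assumes the tools are on PATH.

# Hedged sketch: gather OFED version and Mellanox adapter part numbers.
import subprocess

def run(cmd):
    return subprocess.run(cmd, shell=True, capture_output=True, text=True).stdout.strip()

print("OFED version:", run("ofed_info -s"))
for line in run("lspci | grep Mellanox").splitlines():
    bdf = line.split()[0]  # PCI bus/device/function, e.g. 0a:00.0
    print(run(f'lspci -vv -s {bdf} | grep "Part number" -A 3'))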
@stas00
stas00 / mm_bmm-perf.py
Created February 16, 2024 00:27 — forked from malfet/mm_bmm-perf.py
Measure performance difference of `torch.mm` vs `torch.bmm`
# Benchmark relative performance of torch.mm and torch.bmm with single batch
import torch
import time

def benchmark_fn(fn, args, warmup=5, cycles=300, use_kineto=False) -> float:
    if use_kineto:
        with torch.profiler.profile(activities=[torch.profiler.ProfilerActivity.CUDA]) as p:
            fn(*args)
        return sum([e.cuda_time for e in p.key_averages()])
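
The preview truncates before the non-kineto timing path; below is a hedged, self-contained sketch of how such a comparison might be driven. The shapes and the wall-clock timer are assumptions, not the gist's code.

# Hedged sketch: a minimal wall-clock mm vs. bmm comparison.
def bench(fn, *args, warmup=5, cycles=300):
    for _ in range(warmup):
        fn(*args)
    start = time.time()
    for _ in range(cycles):
        fn(*args)
    return (time.time() - start) / cycles

a, b = torch.rand(512, 512), torch.rand(512, 512)
mm_t = bench(torch.mm, a, b)
bmm_t = bench(torch.bmm, a.unsqueeze(0), b.unsqueeze(0))  # batch of 1
print(f"mm: {mm_t * 1e6:.1f}us  bmm: {bmm_t * 1e6:.1f}us")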
@stas00
stas00 / mfu_compute.py
Created January 5, 2024 23:28 — forked from Chillee/mfu_compute.py
Compute Flop Utilization in PyTorch
import torch
from torch.utils.flop_counter import FlopCounterMode
from triton.testing import do_bench

def get_flops_achieved(f):
    flop_counter = FlopCounterMode(display=False)
    with flop_counter:
        f()
    total_flops = flop_counter.get_total_flops()
    ms_per_iter = do_bench(f)
    print(f"{total_flops / ms_per_iter / 1e9} TFLOP/s")  # FLOP/ms -> TFLOP/s; assumed final line, the preview truncates here
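
A hedged usage sketch (the shape is an assumption; do_bench needs Triton and a GPU):

# Hedged usage sketch: measure achieved throughput of a plain matmul.
a = torch.randn(4096, 4096, device="cuda", dtype=torch.bfloat16)
get_flops_achieved(lambda: a @ a)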
@stas00
stas00 / calc_transformer_flops.py
Created November 22, 2023 01:16 — forked from Quentin-Anthony/calc_transformer_flops.py
Transformer FLOPs with Dense/MoE
import argparse
import math

# Helper function to pretty-print FLOP counts
def convert_flops(params):
    if params == 0:
        return "0"
    size_name = ("", "KFLOPs", "MFLOPs", "GFLOPs", "TFLOPs", "PFLOPs", "EFLOPs", "ZFLOPs", "YFLOPs")
    i = int(math.floor(math.log(params, 1000)))
    p = math.pow(1000, i)
    s = round(params / p, 2)
    return "%s %s" % (s, size_name[i])  # assumed continuation; the preview truncates here
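
A hedged usage example, assuming the continuation above:

# Example: 312 TFLOPs (an A100's peak dense BF16 throughput) pretty-printed.
print(convert_flops(312e12))  # -> "312.0 TFLOPs"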
@stas00
stas00 / calc_transformer_params.py
Created November 22, 2023 01:15 — forked from Quentin-Anthony/calc_transformer_params.py
Transformer Parameter Count
import argparse
import math

# Helper function to pretty-print parameter counts
def convert_params(params):
    if params == 0:
        return "0"
    size_name = ("", "K", "M", "B", "T", "P", "E", "Z", "Y")
    i = int(math.floor(math.log(params, 1000)))
    p = math.pow(1000, i)
    s = round(params / p, 2)
    return "%s %s" % (s, size_name[i])  # assumed continuation; the preview truncates here
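
A hedged usage example, assuming the continuation above:

# Example: a 7B-parameter model pretty-printed.
print(convert_params(7_000_000_000))  # -> "7.0 B"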

Connect via SSH to a Slurm compute job that runs as an Enroot container

Being able to SSH directly into a compute job lets you use all of your remote-development tooling, such as your IDE's debugger (VSCode, PyCharm, ...), for GPU jobs as well.

  • Slurm: the job scheduler many HPC clusters use
  • Enroot: NVIDIA's container runtime, similar to Docker

General problem:

@stas00
stas00 / sft_trainer.py
Created October 13, 2023 17:53 — forked from lewtun/sft_trainer.py
Fine-tuning Mistral 7B with TRL & DeepSpeed ZeRO-3
# This is a modified version of TRL's `SFTTrainer` example (https://github.com/huggingface/trl/blob/main/examples/scripts/sft_trainer.py),
# adapted to run with DeepSpeed ZeRO-3 and Mistral-7B-V1.0. The settings below were run on 1 node of 8 x A100 (80GB) GPUs.
#
# Usage:
# - Install the latest transformers & accelerate versions: `pip install -U transformers accelerate`
# - Install deepspeed: `pip install deepspeed==0.9.5`
# - Install TRL from main: `pip install git+https://github.com/huggingface/trl.git`
# - Clone the repo: `git clone https://github.com/huggingface/trl.git`
# - Copy this Gist into `trl/examples/scripts`
# - Run from the root of the trl repo with: `accelerate launch --config_file=examples/accelerate_configs/deepspeed_zero3.yaml --gradient_accumulation_steps 8 examples/scripts/sft_trainer.py`
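
A hedged sketch of the kind of entry point the gist wraps, assuming TRL's SFTTrainer API of that era; the dataset, model name, and hyperparameters here are illustrative assumptions, not the gist's settings.

# Hedged sketch, not the gist's actual script.
from datasets import load_dataset
from transformers import TrainingArguments
from trl import SFTTrainer

dataset = load_dataset("timdettmers/openassistant-guanaco", split="train")  # dataset assumed
trainer = SFTTrainer(
    model="mistralai/Mistral-7B-v0.1",
    train_dataset=dataset,
    dataset_text_field="text",  # column name assumed
    max_seq_length=512,
    args=TrainingArguments(output_dir="sft-mistral", per_device_train_batch_size=1),
)
trainer.train()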
def layernorm_forward(x, gamma, beta, ln_param):
    """
    Forward pass for layer normalization.

    During both training and test time, the incoming data is normalized per data point
    before being scaled by gamma and beta parameters, identical to those of batch
    normalization. Note that in contrast to batch normalization, the behavior of layer
    normalization during training and test time is identical, and we do not need to
    keep track of running averages of any sort.
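
The preview stops inside the docstring; below is a hedged sketch of the body it describes. The numpy implementation and names are assumptions, not the gist's actual code.

# Hedged sketch of the forward pass the docstring describes.
import numpy as np

def layernorm_forward_sketch(x, gamma, beta, ln_param):
    eps = ln_param.get("eps", 1e-5)
    mu = x.mean(axis=1, keepdims=True)      # per-data-point mean
    var = x.var(axis=1, keepdims=True)      # per-data-point variance
    x_hat = (x - mu) / np.sqrt(var + eps)   # normalize each row over its features
    out = gamma * x_hat + beta              # scale and shift
    cache = (x_hat, gamma, var, eps)        # saved for the backward pass
    return out, cache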
@stas00
stas00 / mp4_sharp_bug.py
Last active February 24, 2022 21:19 — forked from jeffra/mp4_sharp_bug.py
MP4 SHARP bug (edited to support modern launcher and added some status printing to make it easier to see what's going on)
import torch
import torch.distributed as dist
import os
local_rank = int(os.environ["LOCAL_RANK"])
dist.init_process_group(backend='nccl')
torch.cuda.set_device(local_rank)
device = torch.device("cuda", local_rank)
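
The preview ends at the setup. The snippet reads LOCAL_RANK, so it is presumably launched with a modern launcher that sets it, such as torchrun. A hedged continuation sketch to sanity-check the process group; the tensor shape and choice of collective are assumptions, not the gist's code.

# Hedged continuation sketch: verify the process group works.
t = torch.ones(1, device=device) * local_rank
dist.all_reduce(t)  # sums local_rank across all ranks
if local_rank == 0:
    print(f"all_reduce result: {t.item()}")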