Stas Bekman (stas00)

stas00 / static_kv_cache.py
Created March 2, 2024 02:56 — forked from ArthurZucker/static_kv_cache.py
simple static kv cache script
from transformers import AutoModelForCausalLM, AutoTokenizer, StaticCache
import torch
from typing import Optional

device = "cuda"

# Copied from the gpt-fast repo
def multinomial_sample_one_no_sync(probs_sort):  # Does multinomial sampling without a cuda synchronization
    q = torch.empty_like(probs_sort).exponential_(1)
    return torch.argmax(probs_sort / q, dim=-1, keepdim=True).to(dtype=torch.int)
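
A minimal sketch of how this sampler is typically driven (the sample_next_token wrapper and the temperature handling are illustrative, not part of the gist):

def sample_next_token(logits, temperature=1.0):
    # logits: [batch, vocab] at the last generated position
    probs = torch.softmax(logits / max(temperature, 1e-5), dim=-1)
    # exponential-race trick: argmax(probs / Exp(1)) is a multinomial draw
    # that avoids the host sync torch.multinomial would trigger
    return multinomial_sample_one_no_sync(probs)
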
stas00 / Mellanox OFED cheat sheet
Created March 1, 2024 02:40 — forked from githubfoam/Mellanox OFED cheat sheet
Mellanox OFED cheat sheet
--------------------------------------------------------------------------
# ofed_info -s
--------------------------------------------------------------------------
Find the Mellanox adapter type and firmware/driver version
ConnectX-4 card (the example lspci output below is from a ConnectX-3)
# lspci | grep Mellanox
0a:00.0 Network controller: Mellanox Technologies MT27500 Family [ConnectX-3]
# lspci -vv -s 0a:00.0 | grep "Part number" -A 3
# lspci | grep Mellanox | awk '{print $1}' | xargs -i -r mstvpd {}
stas00 / mm_bmm-perf.py
Created February 16, 2024 00:27 — forked from malfet/mm_bmm-perf.py
Measure performance difference of `torch.mm` vs `torch.bmm`
# Benchmark relative performance of torch.mm and torch.bmm with single batch
import torch
import time

def benchmark_fn(fn, args, warmup=5, cycles=300, use_kineto=False) -> float:
    if use_kineto:
        with torch.profiler.profile(activities=[torch.profiler.ProfilerActivity.CUDA]) as p:
            fn(*args)
        return sum([e.cuda_time for e in p.key_averages()])
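
A minimal usage sketch for the preview above (the shapes are illustrative, and the gist's non-kineto wallclock path is cut off in this preview):

# illustrative driver, assuming a CUDA device is available
m, k, n = 1024, 1024, 1024
a = torch.rand(m, k, device="cuda")
b = torch.rand(k, n, device="cuda")
mm_usec = benchmark_fn(torch.mm, (a, b), use_kineto=True)
bmm_usec = benchmark_fn(torch.bmm, (a.unsqueeze(0), b.unsqueeze(0)), use_kineto=True)
print(f"mm: {mm_usec:.1f} usec vs bmm(batch=1): {bmm_usec:.1f} usec")
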
stas00 / mfu_compute.py
Created January 5, 2024 23:28 — forked from Chillee/mfu_compute.py
Compute Flop Utilization in PyTorch
import torch
from torch.utils.flop_counter import FlopCounterMode
from triton.testing import do_bench

def get_flops_achieved(f):
    flop_counter = FlopCounterMode(display=False)
    with flop_counter:
        f()
    total_flops = flop_counter.get_total_flops()
    ms_per_iter = do_bench(f)
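    # (the preview cuts off here; the rest of this function and the usage below
    #  are a sketch of what the full gist reports, not its verbatim code)
    iters_per_sec = 1e3 / ms_per_iter
    print(f"{iters_per_sec * total_flops / 1e12:.1f} TF/s achieved")

# usage sketch: model, shapes and dtype are illustrative
model = torch.nn.Sequential(*[torch.nn.Linear(4096, 4096) for _ in range(8)]).cuda().half()
inp = torch.randn(64, 4096, device="cuda", dtype=torch.half)
get_flops_achieved(lambda: model(inp).sum().backward())
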
stas00 / vram.ipynb
Created December 18, 2023 03:12
memory allocations breakdown
stas00 / calc_transformer_flops.py
Created November 22, 2023 01:16 — forked from Quentin-Anthony/calc_transformer_flops.py
Transformer FLOPs with Dense/MoE
import argparse
import math

# Helper function to pretty-print FLOP counts
def convert_flops(params):
    if params == 0:
        return "0"
    size_name = ("", "KFLOPs", "MFLOPs", "GFLOPs", "TFLOPs", "PFLOPs", "EFLOPs", "ZFLOPs", "YFLOPs")
    i = int(math.floor(math.log(params, 1000)))
    p = math.pow(1000, i)
    s = round(params / p, 2)  # (completion of the truncated preview)
    return f"{s} {size_name[i]}"
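
For a rough sense of scale, a minimal sketch using the helper above with the standard dense-training estimate FLOPs ~ 6 * parameters * tokens (a simplification of what the full script computes; the numbers are illustrative):

# back-of-the-envelope dense training FLOPs: ~6 FLOPs per parameter per token
params = 7e9     # e.g. a 7B-parameter model (illustrative)
tokens = 1e12    # e.g. 1T training tokens (illustrative)
print(convert_flops(6 * params * tokens))   # -> "42.0 ZFLOPs"
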
stas00 / calc_transformer_params.py
Created November 22, 2023 01:15 — forked from Quentin-Anthony/calc_transformer_params.py
Transformer Parameter Count
import argparse
import math

# Helper function to pretty-print parameter counts
def convert_params(params):
    if params == 0:
        return "0"
    size_name = ("", "K", "M", "B", "T", "P", "E", "Z", "Y")
    i = int(math.floor(math.log(params, 1000)))
    p = math.pow(1000, i)
    s = round(params / p, 2)  # (completion of the truncated preview)
    return f"{s} {size_name[i]}"
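
A minimal sketch of the kind of estimate the full script automates, using the common rule of thumb params ~ 12 * layers * hidden^2 plus token embeddings (the shapes below are illustrative and the constant ignores biases, norms, and gated-MLP variants):

hidden, layers, vocab = 4096, 32, 32000   # illustrative 7B-class shape
params = layers * 12 * hidden**2 + vocab * hidden
print(convert_params(params))   # -> "6.57 B"
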
stas00 / benchmark_dist_init.py
Last active November 20, 2023 20:33
Profiling `init_process_group('nccl')`
# run as:
# python -u -m torch.distributed.run --nproc_per_node=8 --rdzv_endpoint localhost:6000 --rdzv_backend c10d benchmark_dist_init.py
import torch
import os
import cProfile
import torch.distributed as dist
import timeit
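
The preview stops at the imports; a minimal sketch of the profiling step itself (it assumes the RANK/WORLD_SIZE/LOCAL_RANK/MASTER_* env vars that torch.distributed.run sets, and uses cProfile as the imports suggest; the full gist may structure this differently):

torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))

start = timeit.default_timer()
# profile the NCCL process-group bring-up itself
cProfile.run("dist.init_process_group('nccl')", sort="cumtime")
print(f"rank {dist.get_rank()}: init took {timeit.default_timer() - start:.2f}s (under cProfile)")

dist.destroy_process_group()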

Connect via SSH to a Slurm compute job that runs as an Enroot container

Being able to SSH directly into a compute job lets you use all of your remote development tools, such as your IDE's debugger (VSCode, PyCharm, ...), for GPU jobs as well.

  • Slurm: Scheduling system that many HPC clusters use
  • Enroot: Container system like Docker for NVIDIA GPUs

General problem:

stas00 / sft_trainer.py
Created October 13, 2023 17:53 — forked from lewtun/sft_trainer.py
Fine-tuning Mistral 7B with TRL & DeepSpeed ZeRO-3
# This is a modified version of TRL's `SFTTrainer` example (https://github.com/huggingface/trl/blob/main/examples/scripts/sft_trainer.py),
# adapted to run with DeepSpeed ZeRO-3 and Mistral-7B-v0.1. The settings below were run on 1 node of 8 x A100 (80GB) GPUs.
#
# Usage:
# - Install the latest transformers & accelerate versions: `pip install -U transformers accelerate`
# - Install deepspeed: `pip install deepspeed==0.9.5`
# - Install TRL from main: `pip install git+https://github.com/huggingface/trl.git`
# - Clone the repo: `git clone https://github.com/huggingface/trl.git`
# - Copy this Gist into trl/examples/scripts
# - Run from root of trl repo with: accelerate launch --config_file=examples/accelerate_configs/deepspeed_zero3.yaml --gradient_accumulation_steps 8 examples/scripts/sft_trainer.py