Stas Bekman (stas00)

@lewtun
lewtun / sft_trainer.py
Last active April 27, 2024 21:28
Fine-tuning Mistral 7B with TRL & DeepSpeed ZeRO-3
# This is a modified version of TRL's `SFTTrainer` example (https://github.com/huggingface/trl/blob/main/examples/scripts/sft_trainer.py),
# adapted to run with DeepSpeed ZeRO-3 and Mistral-7B-V1.0. The settings below were run on 1 node of 8 x A100 (80GB) GPUs.
#
# Usage:
# - Install the latest transformers & accelerate versions: `pip install -U transformers accelerate`
# - Install deepspeed: `pip install deepspeed==0.9.5`
# - Install TRL from main: `pip install git+https://github.com/huggingface/trl.git`
# - Clone the repo: `git clone https://github.com/huggingface/trl.git`
# - Copy this Gist into trl/examples/scripts
# - Run from root of trl repo with: accelerate launch --config_file=examples/accelerate_configs/deepspeed_zero3.yaml --gradient_accumulation_steps 8 examples/scripts/sft_trainer.py
@Chillee
Chillee / mfu_compute.py
Last active April 11, 2024 17:17
Compute Flop Utilization in PyTorch
import torch
from torch.utils.flop_counter import FlopCounterMode
from triton.testing import do_bench
def get_flops_achieved(f):
    # Count the FLOPs of one call to f, then time it to get achieved FLOP/s.
    flop_counter = FlopCounterMode(display=False)
    with flop_counter:
        f()
    total_flops = flop_counter.get_total_flops()
    ms_per_iter = do_bench(f)
    iters_per_second = 1e3 / ms_per_iter
    print(f"{iters_per_second * total_flops / 1e12} TF/s")
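A quick usage sketch (the workload below is hypothetical, purely for illustration):

import torch
import torch.nn as nn

# Hypothetical workload: a small half-precision MLP on GPU.
model = nn.Sequential(nn.Linear(4096, 4096), nn.GELU(), nn.Linear(4096, 4096)).cuda().half()
x = torch.randn(64, 4096, device="cuda", dtype=torch.half)
get_flops_achieved(lambda: model(x))  # prints achieved TF/s for a forward pass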

Connect via SSH to a Slurm compute job that runs as Enroot container

Being able to SSH directly into a compute job has the advantage that you can use all your remote development tools, such as your IDE's debugger (VSCode, PyCharm, ...), for GPU jobs as well.

  • Slurm: Scheduling system that many HPC clusters use
  • Enroot: Container system like Docker for NVIDIA GPUs

General problem:

@palashahuja
palashahuja / nn_mult.py
Created May 13, 2019 18:40
Neural Network Multiplication Approximator
# Reference
# =========
# Why does deep and cheap learning work so well?
# Henry W. Lin, Max Tegmark, and David Rolnick
# Dept. of Physics, Harvard University, Cambridge, MA 02138
# Dept. of Physics, Massachusetts Institute of Technology, Cambridge, MA 02139 and Dept. of Mathematics, Massachusetts Institute of Technology, Cambridge, MA 02139
# Here the input x consists of two numbers, and the output is their product
import numpy as np
# multiplication approximator
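The gist's body is truncated above. As a minimal sketch of the paper's idea (my own reconstruction, not palashahuja's code; the softplus activation and the scale a are assumptions), multiplication can be approximated with just four neurons and a smooth activation s with s''(0) != 0, via x*y ≈ [s(a(x+y)) + s(-a(x+y)) - s(a(x-y)) - s(-a(x-y))] / (4 a^2 s''(0)), exact in the limit a -> 0:

import numpy as np

def softplus(u):
    return np.log1p(np.exp(u))

SOFTPLUS_D2_AT_0 = 0.25  # softplus''(0) = sigmoid(0) * (1 - sigmoid(0))

def approx_multiply(x, y, a=0.01):
    # Four softplus "neurons"; a Taylor expansion around 0 cancels
    # everything except the x*y cross term.
    num = (softplus(a * (x + y)) + softplus(-a * (x + y))
           - softplus(a * (x - y)) - softplus(-a * (x - y)))
    return num / (4 * a**2 * SOFTPLUS_D2_AT_0)

print(approx_multiply(3.0, -7.0))  # ~ -21.0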
@dojoteef
dojoteef / profile.py
Last active June 5, 2023 11:44
A CUDA memory profiler for pytorch
'''
Memory profiling utilities
'''
import gc
import inspect
import linecache
import os.path
import sys
import time
import threading
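The profiler's body is truncated here. As a minimal sketch of the same idea (mine, far simpler than dojoteef's utilities), a context manager can report the CUDA memory delta and peak around a block:

import contextlib
import torch

@contextlib.contextmanager
def cuda_mem_report(label=""):
    # Report allocated-memory delta and peak across the wrapped block.
    torch.cuda.synchronize()
    torch.cuda.reset_peak_memory_stats()
    start = torch.cuda.memory_allocated()
    try:
        yield
    finally:
        torch.cuda.synchronize()
        peak = torch.cuda.max_memory_allocated()
        delta = torch.cuda.memory_allocated() - start
        print(f"{label}: delta={delta / 2**20:.1f} MiB, peak={peak / 2**20:.1f} MiB")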
@yang-zhang
yang-zhang / pytorch-losses-in-plain-python.ipynb
Last active December 21, 2022 07:14
git/yang-zhang.github.io/ds_code/pytorch-losses-in-plain-python.ipynb
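The notebook preview did not survive this capture. In its spirit, here is a sketch (mine, not necessarily what the notebook contains) of one PyTorch loss, cross-entropy, rewritten in plain numpy and checked against F.cross_entropy:

import numpy as np
import torch
import torch.nn.functional as F

def cross_entropy_plain(logits, targets):
    # logits: (N, C) float array; targets: (N,) integer class indices.
    shifted = logits - logits.max(axis=1, keepdims=True)  # numerical stability
    log_probs = shifted - np.log(np.exp(shifted).sum(axis=1, keepdims=True))
    return -log_probs[np.arange(len(targets)), targets].mean()

logits = np.random.randn(4, 5).astype(np.float32)
targets = np.array([0, 2, 1, 4])
print(cross_entropy_plain(logits, targets))
print(F.cross_entropy(torch.from_numpy(logits), torch.from_numpy(targets)).item())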
@tonygentilcore
tonygentilcore / titanic.ipynb
Created March 21, 2018 21:29
Titanic Kaggle w/ fast.ai
@chernjie
chernjie / git-gh-setup.sh
Created February 9, 2015 09:25
Fetch Github's Pull Request as a remote branch
#!/usr/bin/env bash
# For every GitHub remote, add a fetch refspec that maps pull request heads
# to refs/remotes/<remote>/pull/*, skipping remotes that already have one.
git remote -v | grep fetch | grep github | \
while read -r remote url _; do
    if ! git config --get-all "remote.$remote.fetch" | grep -q refs/pull; then
        git config --add "remote.$remote.fetch" \
            '+refs/pull/*/head:refs/remotes/'"$remote"'/pull/*'
    fi
done
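Once the refspec is in place, a plain `git fetch` downloads every pull request head as `refs/remotes/<remote>/pull/<number>`, so you can inspect, say, PR 123 with `git checkout origin/pull/123` (PR number hypothetical).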
@piscisaureus
piscisaureus / pr.md
Created August 13, 2012 16:12
Checkout github pull requests locally

Locate the section for your github remote in the .git/config file. It looks like this:

[remote "origin"]
	fetch = +refs/heads/*:refs/remotes/origin/*
	url = git@github.com:joyent/node.git

Now add the line `fetch = +refs/pull/*/head:refs/remotes/origin/pr/*` to this section. Obviously, change the GitHub URL to match your project's URL. It ends up looking like this:
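[remote "origin"]
	fetch = +refs/heads/*:refs/remotes/origin/*
	url = git@github.com:joyent/node.git
	fetch = +refs/pull/*/head:refs/remotes/origin/pr/*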