Stas Bekman (stas00)

stas00 / PushEvent.py
Created September 23, 2018 20:41 — forked from rubys/PushEvent.py
Allow git-multimail to run as a webhook for GitHub
#!/usr/bin/python
from __future__ import print_function
#
# A simple CGI script useful for debugging GitHub web hooks
# https://developer.github.com/webhooks/
#
import hashlib, hmac, json, os, sys, traceback
from subprocess import Popen, PIPE, STDOUT
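The preview stops at the imports. The heart of a CGI webhook debugger like this is reading the POST body from stdin and checking GitHub's HMAC signature header before doing anything with the payload; the sketch below shows only that check, and the secret variable name, header choice, and responses are my assumptions, not the gist's code.

import hashlib, hmac, os, sys   # same modules the script above imports

def verify_signature(payload, secret, signature_header):
    # GitHub sends "X-Hub-Signature: sha1=<hexdigest>" (newer hooks also send
    # "X-Hub-Signature-256: sha256=<hexdigest>"); CGI exposes it as an env var.
    algo, _, received = signature_header.partition("=")
    expected = hmac.new(secret, payload, getattr(hashlib, algo)).hexdigest()
    return hmac.compare_digest(expected, received)

# Hypothetical usage inside the CGI handler:
payload = sys.stdin.buffer.read()
secret = os.environ.get("WEBHOOK_SECRET", "").encode()   # env var name is an assumption
if verify_signature(payload, secret, os.environ.get("HTTP_X_HUB_SIGNATURE", "sha1=")):
    print("Status: 200 OK\n")        # CGI-style response header + blank line
else:
    print("Status: 403 Forbidden\n")
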
import os, requests, platform, json, subprocess
import tarfile
from zipfile import ZipFile

debug = False
os_type = platform.system().lower()
machine_type = platform.machine().lower()
if debug:
    print(f'Your OS and Machine Type is {os_type} and {machine_type}')
stas00 / all_reduce_bench.py
# python -m torch.distributed.launch --nproc_per_node=2 all_reduce_bench.py
import torch
import torch.distributed as dist
import time
import argparse
import os
import fcntl
TRIALS = 5
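The listing cuts off right after TRIALS. Below is a minimal sketch of the kind of timed all-reduce loop such a benchmark runs; the tensor size and the ring bus-bandwidth formula are assumptions rather than the gist's exact code. With recent PyTorch the equivalent launch command would be torchrun --nproc_per_node=2 all_reduce_bench.py.

import os, time
import torch
import torch.distributed as dist

TRIALS = 5
N = 500_000 * 4   # number of fp32 elements per rank; the size is an assumption

def timed_allreduce(tensor):
    dist.barrier()
    start = time.time()
    dist.all_reduce(tensor)
    torch.cuda.synchronize()
    return time.time() - start

if __name__ == "__main__":
    local_rank = int(os.environ["LOCAL_RANK"])
    torch.cuda.set_device(local_rank)
    dist.init_process_group("nccl")
    x = torch.rand(N, device="cuda")
    n = dist.get_world_size()
    for trial in range(TRIALS):
        duration = timed_allreduce(x)
        size_gb = x.numel() * x.element_size() / 2**30
        algbw = size_gb / duration
        busbw = algbw * 2 * (n - 1) / n   # ring all-reduce bus bandwidth
        if dist.get_rank() == 0:
            print(f"trial {trial}: {duration:.4f}s  algbw {algbw:.2f} GB/s  busbw {busbw:.2f} GB/s")
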
stas00 / composable_actions.py
Created March 2, 2021 18:56 — forked from mnm364/composable_actions.py
Composable Python argparse actions
import argparse
def compose_actions(*actions):
    """Compose many argparse actions into one callable action.

    Args:
        *actions: The actions to compose.

    Returns:
        argparse.Action: Composed action.
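The preview ends inside the docstring. One way to implement such a composition, shown here as a sketch rather than the gist's actual body, is to return an argparse.Action subclass whose __call__ instantiates and invokes each wrapped action in turn:

import argparse

def compose_actions_sketch(*actions):
    class ComposableAction(argparse.Action):
        def __call__(self, parser, namespace, values, option_string=None):
            for action in actions:
                # Re-instantiate each wrapped action with this action's
                # option strings and dest, then run it.
                action(self.option_strings, self.dest)(parser, namespace, values, option_string)
    return ComposableAction

# Hypothetical usage: announce the value, then store it uppercased.
class Announce(argparse.Action):
    def __call__(self, parser, namespace, values, option_string=None):
        print(f"{option_string} was given: {values}")

class StoreUpper(argparse.Action):
    def __call__(self, parser, namespace, values, option_string=None):
        setattr(namespace, self.dest, values.upper())

parser = argparse.ArgumentParser()
parser.add_argument("--name", action=compose_actions_sketch(Announce, StoreUpper))
print(parser.parse_args(["--name", "stas"]).name)   # prints "--name was given: stas", then "STAS"
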
stas00 / mp4_sharp_bug.py
Last active February 24, 2022 21:19 — forked from jeffra/mp4_sharp_bug.py
MP4 SHARP bug (edited to support modern launcher and added some status printing to make it easier to see what's going on)
import torch
import torch.distributed as dist
import os
local_rank = int(os.environ["LOCAL_RANK"])
dist.init_process_group(backend='nccl')
torch.cuda.set_device(local_rank)
device = torch.device("cuda", local_rank)
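The preview ends after the process-group setup. A self-contained sketch of the obvious next step in such a repro, running one collective on the selected device, is below; it is not the gist's actual SHARP reproduction. With the modern launcher it would be run as torchrun --nproc_per_node=2 mp4_sharp_bug.py.

import os
import torch
import torch.distributed as dist

local_rank = int(os.environ["LOCAL_RANK"])
dist.init_process_group(backend='nccl')
torch.cuda.set_device(local_rank)
device = torch.device("cuda", local_rank)

# Run one collective so every rank prints a verifiable result.
t = torch.ones(1, device=device)
dist.all_reduce(t)
print(f"rank {dist.get_rank()}: all_reduce -> {t.item()} (expected {dist.get_world_size()})")
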
def layernorm_forward(x, gamma, beta, ln_param):
    """
    Forward pass for layer normalization.

    During both training and test time, the incoming data is normalized per data point
    before being scaled by the gamma and beta parameters, just as in batch normalization.

    Note that in contrast to batch normalization, the behavior of layer normalization is
    identical at train and test time, so we do not need to keep track of running averages
    of any sort.
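Only the docstring shows in the preview. A minimal numpy sketch of such a forward pass, assuming the cs231n-style (out, cache) return convention and an optional eps entry in ln_param (both of which are my assumptions):

import numpy as np

def layernorm_forward_sketch(x, gamma, beta, ln_param):
    # Normalize each data point (row) across its features, then scale and shift.
    eps = ln_param.get("eps", 1e-5)
    mu = x.mean(axis=1, keepdims=True)
    var = x.var(axis=1, keepdims=True)
    x_hat = (x - mu) / np.sqrt(var + eps)
    out = gamma * x_hat + beta
    cache = (x_hat, gamma, var, eps)   # whatever the backward pass will need
    return out, cache
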
stas00 / sft_trainer.py
Created October 13, 2023 17:53 — forked from lewtun/sft_trainer.py
Fine-tuning Mistral 7B with TRL & DeepSpeed ZeRO-3
# This is a modified version of TRL's `SFTTrainer` example (https://github.com/huggingface/trl/blob/main/examples/scripts/sft_trainer.py),
# adapted to run with DeepSpeed ZeRO-3 and Mistral-7B-V1.0. The settings below were run on 1 node of 8 x A100 (80GB) GPUs.
#
# Usage:
# - Install the latest transformers & accelerate versions: `pip install -U transformers accelerate`
# - Install deepspeed: `pip install deepspeed==0.9.5`
# - Install TRL from main: `pip install git+https://github.com/huggingface/trl.git`
# - Clone the repo: `git clone https://github.com/huggingface/trl.git`
# - Copy this Gist into trl/examples/scripts
# - Run from root of trl repo with: accelerate launch --config_file=examples/accelerate_configs/deepspeed_zero3.yaml --gradient_accumulation_steps 8 examples/scripts/sft_trainer.py
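For orientation, here is a minimal sketch of how TRL's SFTTrainer was typically instantiated around that release; the model and dataset names are placeholders, and newer TRL versions have since moved several of these keyword arguments into SFTConfig, so treat the exact signature as an assumption.

from datasets import load_dataset
from trl import SFTTrainer

# Placeholder dataset: any dataset with a "text" column works here.
dataset = load_dataset("imdb", split="train")

trainer = SFTTrainer(
    "mistralai/Mistral-7B-v0.1",      # model id can be passed as a string
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=512,
)
trainer.train()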

Connect via SSH to a Slurm compute job that runs as Enroot container

Being able to SSH directly into a compute job has the advantage that all of your remote development tools, such as your IDE's debugger (VSCode, PyCharm, ...), can be used for GPU jobs as well.

  • Slurm: Scheduling system that many HPC clusters use
  • Enroot: Container system like Docker for NVIDIA GPUs

General problem:

stas00 / calc_transformer_params.py
Created November 22, 2023 01:15 — forked from Quentin-Anthony/calc_transformer_params.py
Transformer Parameter Count
import argparse
import math
# Helper function to pretty-print parameter counts
def convert_params(params):
    if params == 0:
        return "0"
    size_name = ("", "K", "M", "B", "T", "P", "E", "Z", "Y")
    i = int(math.floor(math.log(params, 1000)))
    p = math.pow(1000, i)
    s = round(params / p, 2)   # value scaled to the chosen suffix
    return "%s%s" % (s, size_name[i])
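As a rough cross-check of what such a script computes: for a decoder-only transformer the standard approximation is params ≈ l·(12·h² + 13·h) + (V + s)·h, i.e. attention, MLP, and layernorm weights per layer plus token and position embeddings. Below is a hedged sketch using the convert_params helper above; the argument names are mine, not necessarily the gist's.

def approx_transformer_params(num_layers, hidden_size, vocab_size, seq_len):
    # Per layer: attention (4*h^2 + 4*h) + MLP with a 4*h intermediate (8*h^2 + 5*h)
    # + two layernorms (4*h)  =>  12*h^2 + 13*h
    per_layer = 12 * hidden_size**2 + 13 * hidden_size
    embeddings = (vocab_size + seq_len) * hidden_size
    return num_layers * per_layer + embeddings

# Example with GPT-2-XL-like shapes (illustrative numbers): prints roughly 1.56B
print(convert_params(approx_transformer_params(48, 1600, 50257, 1024)))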