Zhen Zhang (zarzen)

zarzen / fused_lamb_cuda_kernel.patch
diff --git a/csrc/lamb/fused_lamb_cuda_kernel.cu b/csrc/lamb/fused_lamb_cuda_kernel.cu
index e934b69c..207faa39 100644
--- a/csrc/lamb/fused_lamb_cuda_kernel.cu
+++ b/csrc/lamb/fused_lamb_cuda_kernel.cu
@@ -8,7 +8,7 @@
#include "ATen/cuda/CUDAContext.h"
#include "ATen/cuda/detail/IndexUtils.cuh"
//#include "ATen/Type.h"
-#include <THC/THCGeneral.h>
+// #include <THC/THCGeneral.h>
zarzen / ds-pt1.11.patch
Last active Nov 16, 2021
diff --git a/csrc/lamb/fused_lamb_cuda_kernel.cu b/csrc/lamb/fused_lamb_cuda_kernel.cu
index 0448a45..ff87993 100644
--- a/csrc/lamb/fused_lamb_cuda_kernel.cu
+++ b/csrc/lamb/fused_lamb_cuda_kernel.cu
@@ -464,7 +464,7 @@ void fused_lamb_cuda(at::Tensor& p,
lamb_coeff.data<scalar_t>());
}));
}
- THCudaCheck(cudaGetLastError());
+ AT_CUDA_CHECK(cudaGetLastError());
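
Both patches above adapt DeepSpeed's fused LAMB kernel to newer PyTorch releases, which removed the legacy THC (TorcH Cuda) layer: the <THC/THCGeneral.h> include goes away, and THCudaCheck is replaced by ATen's AT_CUDA_CHECK. A minimal sketch of the replacement in use (the kernel and function names here are placeholders for illustration, not from the patch):

#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/Exceptions.h>

__global__ void dummy_kernel() {}

void launch_and_check() {
    // Launch on the current ATen CUDA stream.
    dummy_kernel<<<1, 1, 0, at::cuda::getCurrentCUDAStream()>>>();
    // AT_CUDA_CHECK throws a c10::Error if the launch failed,
    // mirroring what THCudaCheck used to do.
    AT_CUDA_CHECK(cudaGetLastError());
}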
zarzen / README.md
Last active Nov 8, 2021
deepspeed_loss_test

Usage

python3 test_diff_stages.py
checkpointing_true_bug.log
21: M9 P[5, 6] avail 3.1e+08, max_avail 5.0e+07, queue_sz 5.8e+02, n_inflight 5.1e+03, inflight [9]
-gather param for module 3: {'id': 0, 'status': 'AVAILABLE', 'numel': 78151680, 'persist': False, 'active_sub_modules': {3}}
[2021-07-07 21:16:52,635] [INFO] [stage3.py:42:print_rank_0] wait_for_fetch current submodule id 9
[2021-07-07 21:16:52,635] [INFO] [stage3.py:42:print_rank_0] module id 9 handle is None
22: M23 P[] avail 3.1e+08, max_avail 5.0e+07, queue_sz 5.8e+02, n_inflight 7.8e+07, inflight [0, 23, 2, 1, 3]
[2021-07-07 21:16:52,636] [INFO] [stage3.py:42:print_rank_0] wait_for_fetch current submodule id 23
[2021-07-07 21:16:52,636] [INFO] [stage3.py:42:print_rank_0] module id 23 handle is None
-gather param for module 24: {'id': 151, 'status': 'NOT_AVAILABLE', 'numel': 6553600, 'persist': False, 'active_sub_modules': {24}}
-gather param for module 24: {'id': 152, 'status': 'AVAILABLE', 'numel': 2560, 'persist': True, 'active_sub_modules': {24}}
[2021-07-07 21:16:52,636] [INFO] [utils.py:629:info_rank_
zarzen / model_config.json
Created Jun 19, 2021
bert 5.1B model config
{
"train_batch_size": 512,
"train_micro_batch_size_per_gpu": 8,
"steps_per_print": 100,
"prescale_gradients": false,
"bert_token_file": "bert-large-uncased",
"bert_model_config": {
"vocab_size_or_config_json_file": 32003,
"hidden_size": 2560,
"num_hidden_layers": 64,
zarzen / strip_latex.py
Created May 7, 2021
strip latex code for grammarly check
import re
import argparse
def get_args():
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('--file')
args = arg_parser.parse_args()
return args
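
The preview stops after the argument parsing; a minimal sketch of the stripping step itself (these particular regexes are assumptions for illustration, not the gist's original code):

def strip_latex(text):
    text = re.sub(r'(?<!\\)%.*', '', text)                        # comments
    text = re.sub(r'\\(?:cite|ref|label)\w*\{[^}]*\}', '', text)  # refs/citations/labels
    text = re.sub(r'\$[^$]*\$', 'MATH', text)                     # inline math
    text = re.sub(r'\\[a-zA-Z]+\*?', '', text)                    # remaining commands
    return text

if __name__ == '__main__':
    args = get_args()
    with open(args.file) as f:
        print(strip_latex(f.read()))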
etcd_rendz.py
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
import datetime
import json
zarzen / launch.sh
Created Apr 16, 2021
nccl-tests mpirun launch
#!/bin/bash
NP=8
HOSTS="127.0.0.1:8"
MPI_HOME="/opt/amazon/openmpi"
TEST_BIN="/home/ubuntu/nccl-tests/build/all_reduce_perf"
MPI_BIN="${MPI_HOME}/bin/mpirun"
LD_LIBRARY_PATH="${MPI_HOME}/lib":$LD_LIBRARY_PATH
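
The preview cuts off before the actual launch command; a sketch of how nccl-tests is typically invoked with these variables (the exact flags below are assumptions, not the truncated original):

${MPI_BIN} -np ${NP} -H ${HOSTS} \
    -x LD_LIBRARY_PATH -x NCCL_DEBUG=INFO \
    ${TEST_BIN} -b 8 -e 1G -f 2 -g 1

Here -b/-e sweep message sizes from 8 bytes to 1 GiB, -f 2 doubles the size each step, and -g 1 uses one GPU per MPI rank.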
zarzen / limit_bandwidth.sh
Created Jan 4, 2021
Limit bandwidth by IP
#!/bin/bash
# Note: this script only shapes traffic originating locally. For example, if the
# limit is set on node0 but iperf -s runs on node0 and node1 connects to it via
# `iperf -c node0-ip -P5`, that bandwidth is NOT limited.
# The limit only takes effect when node0 connects out to other nodes.
# Original source: https://serverfault.com/questions/191560/how-can-i-do-traffic-shaping-in-linux-by-ip
NETCARD=ens5 # change this to your network interface
MAXBANDWIDTH=40000 # just pick a sufficiently large value
# reinit
tc qdisc del dev $NETCARD root handle 1
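
The script is truncated after the reinit line; a sketch of how the linked ServerFault approach typically continues (the class IDs, the 10 Mbit/s rate, and the example IP below are assumptions):

# Root HTB qdisc; unclassified traffic falls into default class 9999.
tc qdisc add dev $NETCARD root handle 1: htb default 9999
tc class add dev $NETCARD parent 1: classid 1:1 htb rate ${MAXBANDWIDTH}kbit

# One rate-limited class per destination IP, plus a u32 filter to match it.
tc class add dev $NETCARD parent 1:1 classid 1:10 htb rate 10mbit ceil 10mbit
tc filter add dev $NETCARD parent 1: protocol ip prio 1 u32 \
    match ip dst 192.168.0.2/32 flowid 1:10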
zarzen / vgg16.py
Last active Oct 19, 2020
singleNodeTraining
from torchvision import datasets, transforms, models
import torch
import torchvision
from torch import optim
import os
import torch.nn.functional as F
__n_threads = 4
print('torch num threads:', __n_threads)
torch.set_num_threads(__n_threads)
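
The preview shows only the imports and thread setup; a minimal sketch of the single-node training loop such a script typically builds on top of them (the dataset, batch size, and hyperparameters below are assumptions, not the gist's original code):

model = models.vgg16(num_classes=10)
transform = transforms.Compose([transforms.Resize(224), transforms.ToTensor()])
train_set = datasets.CIFAR10('./data', train=True, download=True, transform=transform)
loader = torch.utils.data.DataLoader(train_set, batch_size=32, shuffle=True)
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

model.train()
for images, labels in loader:
    optimizer.zero_grad()
    loss = F.cross_entropy(model(images), labels)
    loss.backward()
    optimizer.step()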