Zhen Zhang (zarzen)

zarzen / fused_lamb_cuda_kernel.patch
diff --git a/csrc/lamb/fused_lamb_cuda_kernel.cu b/csrc/lamb/fused_lamb_cuda_kernel.cu
index e934b69c..207faa39 100644
--- a/csrc/lamb/fused_lamb_cuda_kernel.cu
+++ b/csrc/lamb/fused_lamb_cuda_kernel.cu
@@ -8,7 +8,7 @@
#include "ATen/cuda/CUDAContext.h"
#include "ATen/cuda/detail/IndexUtils.cuh"
//#include "ATen/Type.h"
-#include <THC/THCGeneral.h>
+// #include <THC/THCGeneral.h>
zarzen / ds-pt1.11.patch
Last active Nov 16, 2021
diff --git a/csrc/lamb/fused_lamb_cuda_kernel.cu b/csrc/lamb/fused_lamb_cuda_kernel.cu
index 0448a45..ff87993 100644
--- a/csrc/lamb/fused_lamb_cuda_kernel.cu
+++ b/csrc/lamb/fused_lamb_cuda_kernel.cu
@@ -464,7 +464,7 @@ void fused_lamb_cuda(at::Tensor& p,
lamb_coeff.data<scalar_t>());
}));
}
- THCudaCheck(cudaGetLastError());
+ AT_CUDA_CHECK(cudaGetLastError());
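
Both patches above adapt DeepSpeed's fused LAMB kernel to newer PyTorch releases, which removed the legacy THC (TorcH Cuda) layer: the <THC/THCGeneral.h> include goes away, and THCudaCheck is replaced by ATen's AT_CUDA_CHECK. A minimal sketch of the replacement in use (the kernel and function names here are placeholders for illustration, not from the patch):

#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/Exceptions.h>

__global__ void dummy_kernel() {}

void launch_and_check() {
    // Launch on the current ATen CUDA stream.
    dummy_kernel<<<1, 1, 0, at::cuda::getCurrentCUDAStream()>>>();
    // AT_CUDA_CHECK throws a c10::Error if the launch failed,
    // mirroring what THCudaCheck used to do.
    AT_CUDA_CHECK(cudaGetLastError());
}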
zarzen / README.md
Last active Nov 8, 2021
deepspeed_loss_test

Usage

python3 test_diff_stages.py
checkpointing_true_bug.log
21: M9 P[5, 6] avail 3.1e+08, max_avail 5.0e+07, queue_sz 5.8e+02, n_inflight 5.1e+03, inflight [9]
-gather param for module 3: {'id': 0, 'status': 'AVAILABLE', 'numel': 78151680, 'persist': False, 'active_sub_modules': {3}}
[2021-07-07 21:16:52,635] [INFO] [stage3.py:42:print_rank_0] wait_for_fetch current submodule id 9
[2021-07-07 21:16:52,635] [INFO] [stage3.py:42:print_rank_0] module id 9 handle is None
22: M23 P[] avail 3.1e+08, max_avail 5.0e+07, queue_sz 5.8e+02, n_inflight 7.8e+07, inflight [0, 23, 2, 1, 3]
[2021-07-07 21:16:52,636] [INFO] [stage3.py:42:print_rank_0] wait_for_fetch current submodule id 23
[2021-07-07 21:16:52,636] [INFO] [stage3.py:42:print_rank_0] module id 23 handle is None
-gather param for module 24: {'id': 151, 'status': 'NOT_AVAILABLE', 'numel': 6553600, 'persist': False, 'active_sub_modules': {24}}
-gather param for module 24: {'id': 152, 'status': 'AVAILABLE', 'numel': 2560, 'persist': True, 'active_sub_modules': {24}}
[2021-07-07 21:16:52,636] [INFO] [utils.py:629:info_rank_
zarzen / model_config.json
Created Jun 19, 2021
bert 5.1B model config
{
"train_batch_size": 512,
"train_micro_batch_size_per_gpu": 8,
"steps_per_print": 100,
"prescale_gradients": false,
"bert_token_file": "bert-large-uncased",
"bert_model_config": {
"vocab_size_or_config_json_file": 32003,
"hidden_size": 2560,
"num_hidden_layers": 64,
zarzen / strip_latex.py
Created May 7, 2021
strip latex code for grammarly check
import re
import argparse
def get_args():
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('--file')
args = arg_parser.parse_args()
return args
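
The preview stops after the argument parsing; a minimal sketch of the stripping step itself (these particular regexes are assumptions for illustration, not the gist's original code):

def strip_latex(text):
    text = re.sub(r'(?<!\\)%.*', '', text)                        # comments
    text = re.sub(r'\\(?:cite|ref|label)\w*\{[^}]*\}', '', text)  # refs/citations/labels
    text = re.sub(r'\$[^$]*\$', 'MATH', text)                     # inline math
    text = re.sub(r'\\[a-zA-Z]+\*?', '', text)                    # remaining commands
    return text

if __name__ == '__main__':
    args = get_args()
    with open(args.file) as f:
        print(strip_latex(f.read()))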
etcd_rendz.py
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
import datetime
import json
zarzen / launch.sh
Created Apr 16, 2021
nccl-tests mpirun launch
#!/bin/bash
NP=8
HOSTS="127.0.0.1:8"
MPI_HOME="/opt/amazon/openmpi"
TEST_BIN="/home/ubuntu/nccl-tests/build/all_reduce_perf"
MPI_BIN="${MPI_HOME}/bin/mpirun"
LD_LIBRARY_PATH="${MPI_HOME}/lib":$LD_LIBRARY_PATH
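
The preview cuts off before the actual launch command; a sketch of how nccl-tests is typically invoked with these variables (the exact flags below are assumptions, not the truncated original):

${MPI_BIN} -np ${NP} -H ${HOSTS} \
    -x LD_LIBRARY_PATH -x NCCL_DEBUG=INFO \
    ${TEST_BIN} -b 8 -e 1G -f 2 -g 1

Here -b/-e sweep message sizes from 8 bytes to 1 GiB, -f 2 doubles the size each step, and -g 1 uses one GPU per MPI rank.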
zarzen / limit_bandwidth.sh
Created Jan 4, 2021
Limit bandwidth by IP
#!/bin/bash
# Note: this script only shapes traffic originating locally. For example, if the
# limit is set on node0 but iperf -s runs on node0 and node1 connects to it via
# `iperf -c node0-ip -P5`, that bandwidth is NOT limited.
# The limit only takes effect when node0 connects out to other nodes.
# Original source: https://serverfault.com/questions/191560/how-can-i-do-traffic-shaping-in-linux-by-ip
NETCARD=ens5 # change this to your network interface
MAXBANDWIDTH=40000 # just pick a sufficiently large value
# reinit
tc qdisc del dev $NETCARD root handle 1
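
The script is truncated after the reinit line; a sketch of how the linked ServerFault approach typically continues (the class IDs, the 10 Mbit/s rate, and the example IP below are assumptions):

# Root HTB qdisc; unclassified traffic falls into default class 9999.
tc qdisc add dev $NETCARD root handle 1: htb default 9999
tc class add dev $NETCARD parent 1: classid 1:1 htb rate ${MAXBANDWIDTH}kbit

# One rate-limited class per destination IP, plus a u32 filter to match it.
tc class add dev $NETCARD parent 1:1 classid 1:10 htb rate 10mbit ceil 10mbit
tc filter add dev $NETCARD parent 1: protocol ip prio 1 u32 \
    match ip dst 192.168.0.2/32 flowid 1:10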
zarzen / vgg16.py
Last active Oct 19, 2020
singleNodeTraining
from torchvision import datasets, transforms, models
import torch
import torchvision
from torch import optim
import os
import torch.nn.functional as F
__n_threads = 4
print('torch num threads:', __n_threads)
torch.set_num_threads(__n_threads)
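
The preview shows only the imports and thread setup; a minimal sketch of the single-node training loop such a script typically builds on top of them (the dataset, batch size, and hyperparameters below are assumptions, not the gist's original code):

model = models.vgg16(num_classes=10)
transform = transforms.Compose([transforms.Resize(224), transforms.ToTensor()])
train_set = datasets.CIFAR10('./data', train=True, download=True, transform=transform)
loader = torch.utils.data.DataLoader(train_set, batch_size=32, shuffle=True)
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

model.train()
for images, labels in loader:
    optimizer.zero_grad()
    loss = F.cross_entropy(model(images), labels)
    loss.backward()
    optimizer.step()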