Jianbin Chang shjwudp

@shjwudp
shjwudp / gist:9eb08bbaf46d9b12239aa62a54d35a48
Last active August 25, 2021 10:31
NCCL benchmark bagua-net vs google-fastsocket vs baseline
baseline
#
# nThread 1 nGpus 1 minBytes 8 maxBytes 134217728 step: 2(factor) warmup iters: 5 iters: 20 validation: 1
#
# Using devices
# Rank 0 Pid 146066 on bjlt03-hi13 device 0 [0x1a] Tesla V100-SXM2-32GB
# Rank 1 Pid 67084 on bjlt03-hi52 device 0 [0x1a] Tesla V100-SXM2-32GB
# Rank 2 Pid 34899 on bjlt03-hi65 device 0 [0x1a] Tesla V100-SXM2-32GB
# Rank 3 Pid 34368 on bjlt03-hi89 device 0 [0x1a] Tesla V100-SXM2-32GB
#
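The header above is the parameter block printed by nccl-tests (all_reduce_perf): buffer sizes swept from 8 B to 128 MB in factor-2 steps, with 5 warmup and 20 timed iterations per size. As a rough, hedged sketch of the same measurement in plain PyTorch (assuming a torchrun launch with the NCCL backend; this is not the nccl-tests binary and reports only algorithm bandwidth, not the extra bus-bandwidth column nccl-tests prints):

# Minimal sketch of the sweep described in the header above: all-reduce
# buffers from 8 B to 128 MB (factor-2 steps), 5 warmup + 20 timed iters.
# Assumes a torchrun launch with the NCCL backend; illustrative only.
import os
import time
import torch
import torch.distributed as dist

dist.init_process_group(backend="nccl")
torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))

size = 8
while size <= 134217728:
    buf = torch.ones(size // 4, device="cuda")  # float32 elements
    for _ in range(5):                          # warmup iters
        dist.all_reduce(buf)
    torch.cuda.synchronize()
    start = time.time()
    for _ in range(20):                         # timed iters
        dist.all_reduce(buf)
    torch.cuda.synchronize()
    elapsed = (time.time() - start) / 20
    if dist.get_rank() == 0:
        # algorithm bandwidth only; nccl-tests also derives bus bandwidth
        print(f"{size:>12} B  {size / elapsed / 1e9:.2f} GB/s")
    size *= 2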
baguarun \
--nproc_per_node 1 \
-x PATH \
-x LD_LIBRARY_PATH \
--bagua_service_port=8129 \
--master_port 8130 \
--no_python \
bash install_nccl.sh
# install_nccl.sh
root@bjlt03-hi43:/share/ai_platform/changjianbin/git.corp.kuaishou.com/shendong/transform_moe# pip list
Package Version
---------------------- -----------
apex 0.1
attrs 21.2.0
backcall 0.2.0
bagua 0.8.2.dev46
beautifulsoup4 4.9.3
black 21.8b0
brotlipy 0.7.0
FROM registry.corp.kuaishou.com/kml-eth-interns/bagua:bagua0.5.0-pytorch1.9.0-cuda11.1-cudnn8-byteps
RUN conda create -n python27 python=2.7 -y \
&& conda init bash \
&& . ~/.bashrc \
&& conda activate python27 \
&& export http_proxy=http://oversea-squid1.jp.txyun:11080 && export https_proxy=http://oversea-squid1.jp.txyun:11080 \
&& wget http://www.mellanox.com/downloads/ofed/MLNX_OFED-5.1-2.3.7.1/MLNX_OFED_LINUX-5.1-2.3.7.1-ubuntu18.04-x86_64.tgz \
&& tar xzf MLNX_OFED_LINUX-5.1-2.3.7.1-ubuntu18.04-x86_64.tgz \
&& cd MLNX_OFED_LINUX-5.1-2.3.7.1-ubuntu18.04-x86_64 \
import torch
import torch.distributed as dist
import time
import os
import logging
import bagua.torch_api as bagua
def all_to_all(output_tensor_list, input_tensor_list, h_group):
    rank = dist.get_rank()
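The preview cuts off right after the rank lookup, so the following is only an illustrative sketch of what an all_to_all helper over a process group typically does, built from batched isend/irecv pairs; it is not the gist's actual body. PyTorch's torch.distributed.all_to_all covers the same exchange directly on the NCCL backend, and the gist's h_group sub-group is simplified to the default group here.

# Hedged sketch of an all_to_all built from point-to-point ops; illustrative,
# not the gist's original implementation. Uses the default (WORLD) group for
# simplicity, whereas the gist passes a sub-group h_group.
import torch
import torch.distributed as dist

def all_to_all_sketch(output_tensor_list, input_tensor_list):
    rank = dist.get_rank()
    world_size = dist.get_world_size()
    ops = []
    for peer in range(world_size):
        if peer == rank:
            # the local shard does not go over the wire
            output_tensor_list[peer].copy_(input_tensor_list[peer])
            continue
        ops.append(dist.P2POp(dist.isend, input_tensor_list[peer], peer))
        ops.append(dist.P2POp(dist.irecv, output_tensor_list[peer], peer))
    if ops:
        for work in dist.batch_isend_irecv(ops):
            work.wait()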
#!/bin/bash
DIR=`pwd`
DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'`
mkdir -p $DIR/logs
# export NCCL_DEBUG=INFO
export NCCL_IB_DISABLE=1
Machine (1007GB total)
  NUMANode L#0 (P#0 503GB)
    Package L#0 + L3 L#0 (48MB)
      L2 L#0 (1280KB) + L1d L#0 (48KB) + L1i L#0 (32KB) + Core L#0
        PU L#0 (P#0)
        PU L#1 (P#64)
      L2 L#1 (1280KB) + L1d L#1 (48KB) + L1i L#1 (32KB) + Core L#1
        PU L#2 (P#1)
        PU L#3 (P#65)
      L2 L#2 (1280KB) + L1d L#2 (48KB) + L1i L#2 (32KB) + Core L#2
@shjwudp
shjwudp / gist:b123bdfadc1cd1886139e89f9213c570
Last active February 8, 2022 09:52
FP16 Adam for PyTorch
import math
import torch
from torch.optim.optimizer import Optimizer
class Adam16(Optimizer):
    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
                 weight_decay=0):
        defaults = dict(lr=lr, betas=betas, eps=eps,
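The snippet stops at the defaults dict. As a hedged sketch of the usual FP16 Adam recipe (not necessarily what the gist does beyond this point): keep an FP32 master copy and FP32 moment buffers per FP16 parameter, apply the Adam update in FP32, then copy the result back into the FP16 tensor.

# Hedged sketch of the common FP16-Adam pattern (FP32 master weights and
# moments, FP16 model weights); illustrative only, not the gist's exact code.
import math
import torch
from torch.optim.optimizer import Optimizer

class Adam16Sketch(Optimizer):
    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
                 weight_decay=0):
        defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay)
        super().__init__(params, defaults)

    @torch.no_grad()
    def step(self):
        for group in self.param_groups:
            beta1, beta2 = group["betas"]
            for p in group["params"]:
                if p.grad is None:
                    continue
                state = self.state[p]
                if len(state) == 0:
                    state["step"] = 0
                    # FP32 master copy plus FP32 moment estimates
                    state["master"] = p.detach().float().clone()
                    state["exp_avg"] = torch.zeros_like(state["master"])
                    state["exp_avg_sq"] = torch.zeros_like(state["master"])
                state["step"] += 1
                grad = p.grad.float()
                if group["weight_decay"] != 0:
                    grad = grad.add(state["master"], alpha=group["weight_decay"])
                state["exp_avg"].mul_(beta1).add_(grad, alpha=1 - beta1)
                state["exp_avg_sq"].mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                bias1 = 1 - beta1 ** state["step"]
                bias2 = 1 - beta2 ** state["step"]
                step_size = group["lr"] * math.sqrt(bias2) / bias1
                denom = state["exp_avg_sq"].sqrt().add_(group["eps"])
                # update the FP32 master, then write back to the FP16 weight
                state["master"].addcdiv_(state["exp_avg"], denom, value=-step_size)
                p.copy_(state["master"].to(p.dtype))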
@shjwudp
shjwudp / gist:5d474ceff420cabe355c1ee85b07b665
Last active February 8, 2022 09:53
FP16 Adam for PyTorch; optimizer state compressed by min-max int16
import math
import torch
from torch.optim.optimizer import Optimizer
class MinMaxCompressTensor:
    def __init__(self, t: torch.Tensor) -> None:
        self.compressed_t = MinMaxCompressTensor._minmax_compress(t)
        self.min_ = t.min()
        self.max_ = t.max()
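The class preview ends here. A minimal sketch of the min-max int16 idea named in the description, with illustrative function names (the gist's own methods are not shown): map each value linearly from [min, max] onto the int16 range on compress, and invert the mapping on decompress, halving the storage of FP32 state at the cost of quantization error.

# Hedged sketch of min-max int16 tensor compression; names illustrative,
# not the gist's exact implementation.
import torch

INT16_MIN, INT16_MAX = -32768, 32767

def minmax_compress(t: torch.Tensor):
    lo, hi = t.min(), t.max()
    scale = (hi - lo).clamp_min(1e-12) / (INT16_MAX - INT16_MIN)
    q = torch.round((t - lo) / scale + INT16_MIN).to(torch.int16)
    return q, lo, scale

def minmax_decompress(q: torch.Tensor, lo: torch.Tensor, scale: torch.Tensor):
    return (q.float() - INT16_MIN) * scale + lo

# 2 bytes per value instead of 4, at the cost of a bounded quantization error.
x = torch.randn(1000)
q, lo, scale = minmax_compress(x)
err = (minmax_decompress(q, lo, scale) - x).abs().max()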
@shjwudp
shjwudp / gist:28a390bb7d6dc54b14c54478d4faeb58
Last active June 11, 2022 11:30
MassiveText Repetition Removal
import sys
import json
import nltk
def is_repetition_removal(
    text, duplicate_line_fraction=0.3, duplicate_line_character_faction=0.2
):
    line_count = 0
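Only the signature and the two thresholds are visible above. A hedged sketch of the duplicate-line checks those thresholds suggest, following the MassiveText/Gopher repetition-removal rules rather than the gist's exact body: flag a document when too large a fraction of its lines are duplicates, or when duplicate lines account for too large a share of its characters.

# Hedged sketch of the duplicate-line checks implied by the two thresholds
# above; illustrative, not the gist's truncated implementation.
from collections import Counter

def looks_repetitive(text, duplicate_line_fraction=0.3,
                     duplicate_line_character_fraction=0.2):
    lines = [line for line in text.splitlines() if line.strip()]
    if not lines:
        return False
    counts = Counter(lines)
    dup_lines = sum(c - 1 for c in counts.values() if c > 1)
    dup_chars = sum(len(line) * (c - 1) for line, c in counts.items() if c > 1)
    total_chars = sum(len(line) for line in lines)
    if dup_lines / len(lines) > duplicate_line_fraction:
        return True
    if total_chars and dup_chars / total_chars > duplicate_line_character_fraction:
        return True
    return False

# Example: a page where the same line repeats many times gets flagged.
print(looks_repetitive("free download\n" * 10 + "real content\n"))  # True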