@ajindal1
Last active April 5, 2023
Code for Torch 2 library loading error

This gist contains three files:

  1. build_pytorch_from_source.sh: the main script to reproduce the error. You only need to change the base directory path in this file (the expected working tree is sketched after this list).
  2. install_msccl_mpi_torch.sh: the script that builds PyTorch from source; no changes required.
  3. install_hpcx_mpi_ucx.sh: the script that installs MPI. You need your own way to obtain the HPC-X MPI package; I can't provide the link to it as it is confidential.
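For orientation, after running the setup steps in build_pytorch_from_source.sh the working tree should look roughly like this (layout inferred from the commands below, shown here for reference only):

test_pytorch2/
  pytorch/                     v2.0.0 checkout
  builder/                     release/2.0 checkout
  install_msccl_mpi_torch.sh
  install_hpcx_mpi_ucx.sh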
build_pytorch_from_source.sh

mkdir test_pytorch2
cd test_pytorch2
git clone https://github.com/pytorch/pytorch.git
git clone https://github.com/pytorch/builder.git
cd pytorch
git checkout v2.0.0
git submodule sync
git submodule update --init --recursive
cd ..
cd builder
git checkout release/2.0
GPU_ARCH_TYPE=cuda GPU_ARCH_VERSION=11.7 manywheel/build_docker.sh
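# Optional check (not in the original gist): confirm the manylinux image was built
# and tagged locally, since the docker run step below uses it with --pull never
sudo docker images pytorch/manylinux-cuda117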
cd ..
# Copy the two files below into the test_pytorch2 folder:
#   install_msccl_mpi_torch.sh
#   install_hpcx_mpi_ucx.sh
# REPLACE the directory path with your location
# export BASE_FOLDER_PATH="/home/abjindal/test_pytorch2/"
export BASE_FOLDER_PATH=<ENTER_YOUR_PATH>
echo "Set Correct Directory Path before proceeding"
chmod +x install_msccl_mpi_torch.sh
cp install_msccl_mpi_torch.sh ${BASE_FOLDER_PATH}builder/manywheel/
export PYTORCH_BUILD_VERSION=2.0.0
export OVERRIDE_PACKAGE_VERSION=2.0.0
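# The derivations below yield PYTORCH_BUILD_VERSION_PREFIX=2.0 and map
# DESIRED_CUDA=117 to CUDA_HOME_PATH=/usr/local/cuda-11.7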
export PYTORCH_BUILD_VERSION_PREFIX=$(echo ${PYTORCH_BUILD_VERSION} | awk -F. '{print $1"."$2}')
export DESIRED_CUDA=117
export CUDA_HOME_PATH=/usr/local/cuda-${DESIRED_CUDA:0:2}.${DESIRED_CUDA:2:1}
export DESIRED_PYTHON=3.8
cd pytorch
sudo docker run --rm --pull never -e USE_NCCL=1 -e NCCL_ROOT=/usr/local/cuda-11.7 -e NCCL_LIB_DIR=/usr/local/cuda-11.7/lib \
-e NCCL_INCLUDE_DIR=/usr/local/cuda-11.7/include -e CUDA_HOME=$CUDA_HOME_PATH -e USE_DISTRIBUTED=1 \
-e PYTORCH_FINAL_PACKAGE_DIR=/builder/conda/package -e SKIP_ALL_TESTS=1 -e BUILD_SPLIT_CUDA=ON \
-e DESIRED_CUDA=${DESIRED_CUDA:0:2}.${DESIRED_CUDA:2:1} -e GPU_ARCH_TYPE=cuda -e DESIRED_PYTHON=$DESIRED_PYTHON \
-e PYTORCH_BUILD_VERSION=$PYTORCH_BUILD_VERSION -e PYTORCH_BUILD_NUMBER=1 -e USE_CUPTI_SO=1 \
-e OVERRIDE_PACKAGE_VERSION=$OVERRIDE_PACKAGE_VERSION -e PYTORCH_ROOT=/pytorch -e INSTALL_MPI=true -e USE_MPI=1 \
-v ${BASE_FOLDER_PATH}pytorch:/pytorch \
-v ${BASE_FOLDER_PATH}builder:/builder -v ${BASE_FOLDER_PATH}install_hpcx_mpi_ucx.sh:/install_hpcx_mpi_ucx.sh \
-v "$(pwd):/final_pkgs" -u root \
pytorch/manylinux-cuda117 \
/builder/manywheel/install_msccl_mpi_torch.sh
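A quick way to verify the resulting build (an illustrative check, not part of the gist; it assumes the wheel lands under builder/conda/package on the host via the PYTORCH_FINAL_PACKAGE_DIR mount, and that the filename pattern below matches the produced artifact):

python -m pip install ${BASE_FOLDER_PATH}builder/conda/package/torch-2.0.0*.whl
python -c "import torch.distributed as dist; print(dist.is_mpi_available())"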
install_hpcx_mpi_ucx.sh

#!/usr/bin/env bash
set -euxo pipefail
# Use the Mellanox HPC-X package to install MPI and UCX so that MPI is configured
# correctly with CUDA. This builds the CUDA-aware MPI needed by Torch models that
# use MPI as the distributed backend.
# Package details: https://docs.nvidia.com/networking/pages/viewpage.action?pageId=12006256
# AZHPC images use similar installation steps: https://github.com/Azure/azhpc-images/blob/master/ubuntu/common/install_mpis.sh
# The HPC-X packages are downloaded from and stored in blob storage.
OS_DISTRIBUTION=$(awk -F= '/^NAME/{print $2}' /etc/os-release | sed -e 's/^"//' -e 's/"$//')
VERSION_ID=$(awk -F= '/^VERSION_ID/{print $2}' /etc/os-release | sed -e 's/^"//' -e 's/"$//')
if [[ "$OS_DISTRIBUTION" == "Ubuntu" ]]; then
OS_VERSION=ubuntu${VERSION_ID}
elif [[ "$OS_DISTRIBUTION" == "CentOS Linux" ]]; then
OS_VERSION=redhat${VERSION_ID}
else
echo "OS distribution $OS_DISTRIBUTION not supported"
exit 1
fi
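# e.g. on CentOS 7 this resolves to OS_VERSION=redhat7, which matches the
# HPC-X tarball naming used below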
# download hpcx package from blob storage
HPCX_VERSION="v2.13"
TARBALL="hpcx-${HPCX_VERSION}-gcc-MLNX_OFED_LINUX-5-${OS_VERSION}-cuda11-gdrcopy2-nccl2.12-x86_64.tbz"
CONTAINER_NAME="hpcx-packages"
HPCX_FOLDER=$(basename ${TARBALL} .tbz)
# Define your own URL for downloading the HPC-X tarball
HPCX_DOWNLOAD_URL="<YOUR_URL>/${CONTAINER_NAME}/${TARBALL}"
wget --no-verbose --retry-connrefused --tries=3 --waitretry=5 -O $TARBALL $HPCX_DOWNLOAD_URL
tar -xf ${TARBALL} && rm -rf ${TARBALL}
# copy over ompi and ucx
HPCX_DIR=/opt/hpcx
HPCX_MPI_DIR=${HPCX_DIR}/ompi
HPCX_UCX_DIR=${HPCX_DIR}/ucx
mkdir -p ${HPCX_DIR}
mv ${HPCX_FOLDER}/ompi ${HPCX_DIR}/ompi
mv ${HPCX_FOLDER}/ucx ${HPCX_DIR}/ucx
rm -rf ${HPCX_FOLDER}
# create symbolic links for /usr/local/mpi and /usr/local/ucx
ln -sf ${HPCX_MPI_DIR} /usr/local/mpi
ln -sf ${HPCX_UCX_DIR} /usr/local/ucx
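# Loosen HPC-X's shipped MCA defaults: switch core binding off and comment out
# the restrictive 'btl = self' line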
sed -i 's/^\(hwloc_base_binding_policy\) = core$/\1 = none/' ${HPCX_MPI_DIR}/etc/openmpi-mca-params.conf
sed -i 's/^\(btl = self\)$/#\1/' ${HPCX_MPI_DIR}/etc/openmpi-mca-params.conf
if [[ "$OS_DISTRIBUTION" == "CentOS Linux" ]] || [[ "$DSVM" == "True" ]]; then
# Set openmpi and ucx related environment variables
# Corresponding environment variables are using ENV in dockerfile
# Any change here might require a change in the dockerfile too
echo 'export OPAL_PREFIX=/opt/hpcx/ompi
export PATH=/usr/local/mpi/bin:${PATH}
export PATH=/usr/local/ucx/bin:${PATH}' >> /etc/profile.d/hpcx_ompi_ucx.sh
fi
# Allow OpenMPI's mpirun to run as root
echo 'export OMPI_ALLOW_RUN_AS_ROOT=1' >> /root/.bashrc
echo 'export OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1' >> /root/.bashrc
# Set the library paths for OpenMPI and UCX and run ldconfig.
# Using ld.so.conf.d entries sets the MPI paths correctly even for non-root users.
echo "${HPCX_MPI_DIR}/lib
${HPCX_UCX_DIR}/lib" >> /etc/ld.so.conf.d/hpcx_ompi_ucx.conf
ldconfig
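# Optional sanity check (an addition, not in the original script): confirm the
# dynamic loader now resolves the HPC-X MPI and UCX libraries
ldconfig -p | grep -E 'libmpi|libucp'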
install_msccl_mpi_torch.sh

#!/usr/bin/env bash
DESIRED_PYTHON=cp38-cp38
pydir="/opt/python/$DESIRED_PYTHON"
export PATH="$pydir/bin:$PATH"
bash /install_hpcx_mpi_ucx.sh
source /etc/profile.d/hpcx_ompi_ucx.sh
# Install mpi4py (Python bindings for MPI)
python -m pip install mpi4py
# Test OpenMPI & mpi4py installation
mpiexec --allow-run-as-root -n 5 python -m mpi4py.bench helloworld
source /builder/manywheel/build.sh