@chenyaofo
chenyaofo / deepspeed-benchmark.md
Last active May 3, 2024 12:53
Throughput benchmark of DeepSpeed-based LLM training code.

We train an LLM with this code and report the training speed under different settings (see the table below). The machine has 8× A800 GPUs, 1 TB of CPU memory, and 2× Intel Xeon 8358 CPUs. Software: CUDA 12.1, PyTorch 2.2.0, DeepSpeed 0.14.2.

Table. Benchmark of LLaMA-7B models using DeepSpeed-based training code. The sequence length is 4096.

| ZeRO Stage | Ckpt.[^1] | Optim. Off.[^2] | Param. Off.[^3] | ZeRO++[^4] | BS[^5] | CPU Mem.[^6] | GPU Mem.[^7] | Th.put |
|---|---|---|---|---|---|---|---|---|
| 2 | × | × | × | × | 1/64 | 320.1 | 19.4/44.8 | 5.33 |
| 2 | ✓ | × | × | × | 1/64 | 320.0 | 19.4/23.5 | 4.19 |
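
For context, a minimal sketch of a DeepSpeed config that would correspond to the ZeRO-2 rows above; the gist does not include the actual config, so every value here is an assumption.

# Hypothetical DeepSpeed config for the ZeRO-2 rows above; all values are assumptions.
ds_config = {
    "train_micro_batch_size_per_gpu": 1,  # "BS 1/64": micro-batch 1, global batch 64
    "gradient_accumulation_steps": 8,     # 1 micro-batch x 8 GPUs x 8 steps = 64
    "bf16": {"enabled": True},
    "gradient_clipping": 1.0,
    "zero_optimization": {
        "stage": 2,
        # For the offload rows, add e.g.:
        # "offload_optimizer": {"device": "cpu", "pin_memory": True},
    },
}

# engine, optimizer, _, _ = deepspeed.initialize(model=model, config=ds_config)
# (requires a model and the deepspeed launcher; omitted here)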
@chenyaofo
chenyaofo / download.py
Created December 13, 2023 08:28
Main script from kubeedge/sedna-storage-initializer:v0.3.0
#!/usr/bin/env python3
# Copyright 2021 The KubeEdge Authors.
# Copyright 2020 kubeflow.org.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
@chenyaofo
chenyaofo / serialize_transforms.py
Created September 7, 2023 11:11
Serialize transforms into files.
import torch.package as package
import torch
import torchvision.transforms as T

def get_train_transforms(crop_size, mean, std, is_training):
    pipelines = []
    if is_training:
        pipelines.append(T.RandomResizedCrop(crop_size))
        pipelines.append(T.RandomHorizontalFlip())
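
The preview cuts off before the packaging step; below is a minimal sketch of how such a pipeline could be written to and read back from a file with torch.package (the file and resource names are illustrative, not taken from the gist).

import torch
import torchvision.transforms as T

# A concrete pipeline to serialize; the ImageNet mean/std here are assumptions.
transform = T.Compose([
    T.RandomResizedCrop(224),
    T.RandomHorizontalFlip(),
    T.ToTensor(),
    T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Write the pickled pipeline into a self-describing package file.
with torch.package.PackageExporter("transforms.pt") as exporter:
    exporter.extern("torch.**")        # resolve torch from the local install
    exporter.extern("torchvision.**")  # resolve torchvision from the local install
    exporter.save_pickle("transforms", "train.pkl", transform)

# Read it back later, possibly in a different process.
importer = torch.package.PackageImporter("transforms.pt")
restored = importer.load_pickle("transforms", "train.pkl")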
@chenyaofo
chenyaofo / config.yaml
Created August 6, 2023 04:05
Cloudflare-Clash-Tunnel
port: 7890
socks-port: 7891
allow-lan: true
mode: Global
log-level: info
external-controller: :9090
profile:
  store-selected: true
  store-fake-ip: true
@chenyaofo
chenyaofo / llama-pipeline.py
Created June 19, 2023 01:28
LLaMA Pipeline Parallelism
import torch
import torch.nn.functional as F
from transformers.models.llama.modeling_llama import LlamaDecoderLayer, LlamaRMSNorm, LlamaConfig, LlamaForCausalLM
import deepspeed
from deepspeed.pipe import PipelineModule, LayerSpec

class EmbeddingPipe(torch.nn.Embedding):
    def forward(self, args):
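
The preview stops inside EmbeddingPipe; here is a minimal sketch of how pipeline stages are typically declared with LayerSpec and assembled into a PipelineModule (the tuple layout, stage count, and dimensions are assumptions, not the gist's actual code).

import torch
from transformers.models.llama.modeling_llama import LlamaConfig
from deepspeed.pipe import PipelineModule, LayerSpec

config = LlamaConfig()  # assumed: default (7B-style) dimensions

class EmbeddingPipe(torch.nn.Embedding):
    # Pipeline engines pass a single tuple of tensors between stages, so each
    # layer unpacks its inputs and repacks everything the next stage needs.
    def forward(self, args):
        input_ids, attention_mask = args
        return super().forward(input_ids), attention_mask

# LayerSpec defers construction: each layer is instantiated only on the
# pipeline rank that owns it, which keeps per-rank memory bounded.
layers = [
    LayerSpec(EmbeddingPipe, config.vocab_size, config.hidden_size),
    # ... one LayerSpec per LlamaDecoderLayer, then a final norm + LM head ...
]

# Must run under the deepspeed launcher (distributed must be initialized):
# model = PipelineModule(layers=layers, num_stages=2, partition_method="parameters")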
@chenyaofo
chenyaofo / engine.py
Created June 16, 2023 12:00
Deep Learning Engine
import pathlib
import loguru
import dataclasses
import deepspeed
import torch4x
from deepspeed import comm as dist
import pprint
@chenyaofo
chenyaofo / asyncio_read.py
Created May 31, 2023 09:37
Async reading of multiple files.
import asyncio
import aiofiles

tar_filenames = [f"/home/chenyaofo/datasets/imagenet-wds/train/{i:06d}.tar" for i in range(256)]
# tar_filenames = [f"/gpfs01/home/chenyaofo/imagenet-wds/train/{i:06d}.tar" for i in range(256)]
count = 0

def async_reading():
    print("asyncio reading based on naive asyncio")
@chenyaofo
chenyaofo / Dockerfile
Last active April 12, 2023 12:04
PyTorch 2.0 Docker
FROM nvidia/cuda:11.8.0-devel-ubuntu22.04
ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 \
    PATH=/opt/conda/bin:$PATH \
    PYTHON_VERSION=3.10
RUN APT_INSTALL="apt-get install -y --no-install-recommends --no-install-suggests" && \
    GIT_CLONE="git clone --depth 10" && \
    rm -rf /etc/apt/sources.list.d/cuda.list \
@chenyaofo
chenyaofo / Dockerfile-PyTorch-github-action
Last active April 2, 2023 15:02
Dockerfile for building PyTorch via GitHub Actions
FROM nvidia/cuda:11.8.0-devel-ubuntu22.04
ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 \
    PATH=/opt/conda/bin:$PATH \
    TZ=Asia/Shanghai \
    PYTHON_VERSION=3.9
RUN APT_INSTALL="apt-get install -y --no-install-recommends --no-install-suggests" && \
    GIT_CLONE="git clone --depth 10" && \
    rm -rf /etc/apt/sources.list.d/cuda.list \
@chenyaofo
chenyaofo / load_image.py
Created March 15, 2023 13:07
PyTorch data loading from TFRecord files
import torch
import torchvision.transforms as transforms
from PIL import Image
from io import BytesIO
from torchdata.datapipes.iter import FileLister, FileOpener, TFRecordLoader, Mapper, Shuffler, Batcher, Collator, ShardingFilter
from torchdata.dataloader2 import adapter, DataLoader2, PrototypeMultiProcessingReadingService
from codebase.torchutils.serialization import jsonunpack
from torch.utils.data import DataLoader
def get_train_transforms():
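
The preview stops at the transform helper; a minimal sketch of a torchdata pipeline built from the imported datapipes (the shard path, the "image" record key, and the decode logic are assumptions).

from io import BytesIO
from PIL import Image
import torchvision.transforms as transforms
from torchdata.datapipes.iter import FileLister, FileOpener, TFRecordLoader, Shuffler, ShardingFilter, Mapper
from torch.utils.data import DataLoader

transform = transforms.Compose([
    transforms.RandomResizedCrop(224),
    transforms.ToTensor(),
])

def decode(record):
    # Assumed record layout: one JPEG payload under the "image" key.
    image = Image.open(BytesIO(record["image"][0])).convert("RGB")
    return transform(image)

dp = FileLister("/path/to/shards", masks="*.tfrecord")  # hypothetical location
dp = FileOpener(dp, mode="b")      # open each shard as a binary stream
dp = TFRecordLoader(dp)            # parse records into feature dicts
dp = Shuffler(dp, buffer_size=1000)
dp = ShardingFilter(dp)            # avoid duplicate records across workers
dp = Mapper(dp, decode)
loader = DataLoader(dp, batch_size=64, num_workers=4)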