Created
April 22, 2024 12:00
-
-
Save eavae/fd993ded22f57f03a966cb541243c3bb to your computer and use it in GitHub Desktop.
Simple DeepSpeed with Zero-1
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import torch | |
import enum | |
import deepspeed | |
import argparse | |
from torch import nn | |
from torch.utils.data import Dataset, DataLoader | |
from torch.multiprocessing import spawn | |
class BoolOps(enum.Enum):
    """Boolean operators the toy dataset asks the model to evaluate.

    Member values are 1-based, whereas ``BoolDataset`` stores 0-based
    operator codes: member ``op`` corresponds to stored code ``op.value - 1``.
    """

    AND = 1
    OR = 2
    XOR = 3


class BoolDataset(Dataset):
    """Random two-operand boolean problems with their ground-truth answers.

    Attributes:
        size: Number of samples.
        data: ``(size, 2)`` integer tensor of 0/1 operands.
        ops: ``(size,)`` integer tensor of operator codes in ``[0, 3)``;
            code ``k`` selects the ``BoolOps`` member whose value is ``k + 1``.
        labels: ``(size,)`` ``torch.long`` tensor holding the 0/1 result of
            applying each sample's operator to its two operands.
    """

    def __init__(self, size=128):
        """Draw ``size`` random samples and compute their labels.

        Args:
            size: Number of samples to generate.
        """
        self.size = size
        # Operands are drawn before operator codes, as in the original, so a
        # fixed RNG seed still yields the same data/ops tensors.
        self.data = torch.randint(0, 2, (size, 2))
        self.ops = torch.randint(0, len(BoolOps), (size,))
        self.labels = torch.zeros(size, dtype=torch.long)

        left, right = self.data[:, 0], self.data[:, 1]
        answers = {
            BoolOps.AND: left & right,
            BoolOps.OR: left | right,
            BoolOps.XOR: left ^ right,
        }
        for op, answer in answers.items():
            # Codes are 0-based but enum values are 1-based.  Comparing them
            # directly (as the original did) left code 0 unlabelled, shifted
            # AND/OR onto the wrong codes and made XOR unreachable.
            selected = self.ops == op.value - 1
            self.labels[selected] = answer[selected]

    def __len__(self):
        """Return the number of samples."""
        return self.size

    def __getitem__(self, idx):
        """Return the ``(operands, operator code, label)`` triple at ``idx``."""
        return self.data[idx], self.ops[idx], self.labels[idx]
class Network(nn.Module):
    """Small classifier that returns its own training loss.

    An operator embedding and a linear projection of the two operands are
    summed, passed through two linear layers (with no activation in between)
    and scored against the labels with cross-entropy over two classes.
    """

    def __init__(self, embed_dim=1024):
        """Create the layers.

        Args:
            embed_dim: Width of the shared embedding space; the hidden layer
                is twice as wide.
        """
        super().__init__()
        hidden_dim = embed_dim * 2
        # Three operator codes; looked-up rows are renormalised to norm <= 1.
        self.cat_embedding = nn.Embedding(3, embed_dim, max_norm=1.0)
        # Projects the pair of operands into the same space as the operator.
        self.real_embedding = nn.Linear(2, embed_dim)
        self.fc1 = nn.Linear(embed_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, 2)

    def forward(self, bools, ops, labels):
        """Return the mean cross-entropy loss for a batch.

        Args:
            bools: Floating-point operands whose last dimension is 2.
            ops: Integer operator codes accepted by the 3-row embedding.
            labels: Integer class indices (0 or 1) for each sample.
        """
        operator_vec = self.cat_embedding(ops)
        operand_vec = self.real_embedding(bools)
        logits = self.fc2(self.fc1(operator_vec + operand_vec))
        return nn.functional.cross_entropy(logits, labels)
def _report_optimizer_sizes(optimizer):
    """Print element counts for the optimizer's weight copy and Adam moments.

    Args:
        optimizer: The optimizer returned by ``deepspeed.initialize``; its
            ``state_dict()`` must expose ``single_partition_of_fp32_groups``
            and ``base_optimizer_state["state"]``.
    """
    # Build the state dict once instead of once per statistic.
    optimizer_state = optimizer.state_dict()
    per_param_states = list(optimizer_state["base_optimizer_state"]["state"].values())

    # Size of the model-weight copy kept by the optimizer.
    # NOTE(review): presumably this rank's fp32 partition under ZeRO-1 — confirm.
    print(
        "Optimizer Backup Model Size: ",
        sum(p.numel() for p in optimizer_state["single_partition_of_fp32_groups"]),
    )
    # Number of elements in the optimizer state "momentum" (exp_avg).
    print(
        "Optimizer State Momentum Size: ",
        sum(state["exp_avg"].numel() for state in per_param_states),
    )
    # Number of elements in the optimizer state "variance" (exp_avg_sq).
    print(
        "Optimizer State Variance Size: ",
        sum(state["exp_avg_sq"].numel() for state in per_param_states),
    )


def main(config):
    """Train ``Network`` on ``BoolDataset`` with DeepSpeed and report sizes.

    Runs a single pass over the dataset and prints the optimizer's weight-copy
    and Adam-state element counts after every optimizer step.

    Args:
        config: DeepSpeed configuration, forwarded unchanged to
            ``deepspeed.initialize`` (the script passes ``--deepspeed_config``).
    """
    deepspeed.init_distributed(dist_backend="nccl")

    dataset = BoolDataset()
    model = Network()
    model_engine, optimizer, _, _ = deepspeed.initialize(
        model=model,
        model_parameters=model.parameters(),
        config=config,
    )

    # Each process builds its own loader, so it must yield the per-GPU micro
    # batch.  ``train_batch_size()`` is the global batch (micro batch x
    # gradient-accumulation steps x world size) and would over-feed every
    # rank whenever more than one GPU or accumulation step is configured.
    dataloader = DataLoader(
        dataset,
        batch_size=model_engine.train_micro_batch_size_per_gpu(),
        shuffle=True,
    )

    # Loop-invariant: query the engine's dtype and device once.
    model_dtype, _ = model_engine.get_data_types()
    device = model_engine.device

    for bools, ops, labels in dataloader:
        # NOTE(review): the engine is expected to clear gradients in step();
        # this explicit call is kept from the original — confirm if redundant.
        optimizer.zero_grad()
        loss = model_engine(
            # Operands are cast to the engine's dtype for the linear layer;
            # operator codes and labels stay integer.
            bools.to(device, dtype=model_dtype),
            ops.to(device),
            labels.to(device),
        )
        model_engine.backward(loss)
        model_engine.step()

        _report_optimizer_sizes(optimizer)
if __name__ == "__main__":
    # Command line: a --local_rank option (parsed but not read by this
    # script) plus whatever flags DeepSpeed registers on the parser.
    cli = argparse.ArgumentParser()
    cli.add_argument("--local_rank", type=int, default=0)
    cli = deepspeed.add_config_arguments(cli)

    # Only the DeepSpeed config value is handed on to the training entry point.
    main(cli.parse_args().deepspeed_config)
""" | |
bash: | |
deepspeed example_deepspeed_zero_1.py --deepspeed --deepspeed_config deepspeed_config.json | |
deepseed_config.json: | |
{ | |
"train_micro_batch_size_per_gpu": 32, | |
"zero_optimization": { | |
"stage": 1 | |
}, | |
"optimizer": { | |
"type": "Adam", | |
"params": { | |
"lr": 1e-3 | |
} | |
} | |
} | |
""" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment