Estimate distributed training time (multi-node / multi-GPU)
""" | |
Small script estimating the training time of models, depending on number of nodes / GPUs / batch size... | |
""" | |
from pprint import pformat | |
from math import ceil | |
def sec_to_hours(nb_sec: float, no_seconds: bool = False) -> str: | |
m, s = divmod(nb_sec, 60) | |
h, m = divmod(m, 60) | |
h, m, s = map(int, (h, m, s)) | |
if no_seconds: | |
return f"{h}h:{m}m" | |
return f"{h}h:{m}m:{s}s" | |

# Constants
nb_train_samples = 5521826
nb_valid_samples = 55776
per_device_batch_size = 64
per_device_batch_size_valid = 64
nb_devices_per_node = 8  # GPUs per node
validation_steps = 1000  # in the first scenario, a validation is run every 1000 training steps
all_nb_nodes = range(1, 9)  # 1 to 8 nodes
all_nb_training_steps = (60000, 80000, 100000)
all_nb_epochs = (15, 20, 25)

# Measured step times, for 2 nodes and a batch size of 64 per device.
# They vary very little with a different nb of nodes > 1.
sec_per_step = {  # model size -> seconds per training step
    "360m": 1.61,  # V100
    "1b": 1.76,  # A100
    "3b": 4.03,  # A100
}
sec_per_step_valid = {  # model size -> seconds per validation step, written as 1 / (steps per second)
    "360m": 1 / 2.44,  # V100
    "1b": 1 / 2.49,  # A100
    "3b": 1 / 1.18,  # A100
}
batch_sizes = {nb_nodes: nb_nodes * per_device_batch_size * nb_devices_per_node for nb_nodes in all_nb_nodes}
batch_sizes_valid = {nb_nodes: nb_nodes * per_device_batch_size_valid * nb_devices_per_node for nb_nodes in all_nb_nodes}
nb_steps_per_epoch = {nb_nodes: nb_train_samples / bsz for nb_nodes, bsz in batch_sizes.items()}
nb_steps_per_validation = {nb_nodes: nb_valid_samples / bsz for nb_nodes, bsz in batch_sizes_valid.items()}
total_nb_of_gpus = {nb_nodes: nb_nodes * nb_devices_per_node for nb_nodes in all_nb_nodes}
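# e.g. with 2 nodes: global batch size = 2 * 8 * 64 = 1024 samples,
# so one epoch = 5521826 / 1024 ≈ 5392.4 training steps.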

# First scenario: a validation is performed every `validation_steps` training steps
print("WITH FIXED NB OF TRAINING STEPS:\n")
for nb_training_steps in all_nb_training_steps:
    nb_ep_for_ts = {nb_nodes: round(nb_training_steps / spe, 2) for nb_nodes, spe in nb_steps_per_epoch.items()}
    nb_validations = nb_training_steps // validation_steps
    total_time_steps = {}
    for size, sps in sec_per_step.items():
        training_sec = nb_training_steps * sps  # independent of the nb of nodes, as sec/step is assumed constant
        for nb_nodes in all_nb_nodes:
            validation_sec = nb_validations * nb_steps_per_validation[nb_nodes] * sec_per_step_valid[size]
            global_sec = validation_sec + training_sec
            total_time_steps[f"{size} - {nb_nodes} nodes"] = (
                f"{sec_to_hours(global_sec)} ({sec_to_hours(global_sec * total_nb_of_gpus[nb_nodes], True)} GPU hours)"
                f" - {batch_sizes[nb_nodes]} batch size - {nb_ep_for_ts[nb_nodes]} epochs"
            )
    print(f"Total time (for any nb of nodes > 1) for {nb_training_steps} train steps:"
          f"\n{pformat(total_time_steps, width=120)}\n")

# Second scenario: a validation is performed after each epoch
print("WITH FIXED NB OF EPOCHS:\n")
for nb_epochs in all_nb_epochs:
    total_time_ep = {}
    for size, sps in sec_per_step.items():
        for nb_nodes, spe in nb_steps_per_epoch.items():
            training_sec = nb_epochs * spe * sps
            validation_sec = nb_epochs * nb_steps_per_validation[nb_nodes] * sec_per_step_valid[size]
            global_sec = validation_sec + training_sec
            nb_training_steps_ = ceil(nb_epochs * spe)
            total_time_ep[f"{size} - {nb_nodes} nodes"] = (
                f"{sec_to_hours(global_sec)} ({sec_to_hours(global_sec * total_nb_of_gpus[nb_nodes], True)} GPU hours)"
                f" - {batch_sizes[nb_nodes]} batch size - {nb_training_steps_} train steps"
            )
    print(f"Total time for {nb_epochs} epochs:\n{pformat(total_time_ep, width=120)}\n")