Skip to content

Instantly share code, notes, and snippets.

@Natooz
Last active August 2, 2023 15:59
Show Gist options
  • Save Natooz/3efd999d64825df667948a10ef44f638 to your computer and use it in GitHub Desktop.
Save Natooz/3efd999d64825df667948a10ef44f638 to your computer and use it in GitHub Desktop.
Estimate distributed training time (multi-node / multi-GPU)
"""
Small script estimating the training time of models, depending on number of nodes / GPUs / batch size...
"""
from pprint import pformat
from math import ceil
def sec_to_hours(nb_sec: float, no_seconds: bool = False) -> str:
    """Format a duration given in seconds as ``"Hh:Mm:Ss"``.

    :param nb_sec: duration in seconds (may be fractional; truncated to whole units).
    :param no_seconds: when True, omit the seconds part (``"Hh:Mm"``).
    :return: the formatted duration string.
    """
    total_minutes, seconds = divmod(nb_sec, 60)
    hours, minutes = divmod(total_minutes, 60)
    # Truncate every component to a whole number for display.
    hours, minutes, seconds = (int(v) for v in (hours, minutes, seconds))
    if no_seconds:
        return f"{hours}h:{minutes}m"
    return f"{hours}h:{minutes}m:{seconds}s"
# ------------------------------------------------------------------ constants
nb_train_samples = 5521826
nb_valid_samples = 55776
per_device_batch_size = 64
per_device_batch_size_valid = 64
nb_devices_per_node = 8
validation_steps = 1000
all_nb_nodes = range(1, 9)
all_nb_training_steps = (60000, 80000, 100000)
all_nb_epochs = (15, 20, 25)

# Per-step durations measured with 2 nodes and a per-device batch size of 64.
# These vary very little with a different number of nodes > 1.
sec_per_step = {
    "360m": 1.61,  # V100
    "1b": 1.76,  # A100
    "3b": 4.03,  # A100
}
sec_per_step_valid = {
    "360m": 1 / 2.44,  # V100
    "1b": 1 / 2.49,  # A100
    "3b": 1 / 1.18,  # A100
}

# Quantities derived for each candidate node count.
batch_sizes = {}
batch_sizes_valid = {}
nb_steps_per_epoch = {}
nb_steps_per_validation = {}
total_nb_of_gpus = {}
for n_nodes in all_nb_nodes:
    batch_sizes[n_nodes] = n_nodes * per_device_batch_size * nb_devices_per_node
    batch_sizes_valid[n_nodes] = n_nodes * per_device_batch_size_valid * nb_devices_per_node
    nb_steps_per_epoch[n_nodes] = nb_train_samples / batch_sizes[n_nodes]
    nb_steps_per_validation[n_nodes] = nb_valid_samples / batch_sizes_valid[n_nodes]
    total_nb_of_gpus[n_nodes] = n_nodes * nb_devices_per_node
# A validation is performed every `validation_steps` training steps.
print("WITH FIXED NB OF TRAINING STEPS:\n")
for nb_training_steps in all_nb_training_steps:
    # Number of epochs each node count would complete within that many steps.
    nb_ep_for_ts = {}
    for nb_nodes, steps_per_epoch in nb_steps_per_epoch.items():
        nb_ep_for_ts[nb_nodes] = round(nb_training_steps / steps_per_epoch, 2)
    nb_validations = nb_training_steps // validation_steps
    total_time_steps = {}
    for size, train_sec_per_step in sec_per_step.items():
        # Training time does not depend on the node count here (fixed step count).
        training_sec = nb_training_steps * train_sec_per_step
        for nb_nodes in all_nb_nodes:
            validation_sec = nb_validations * nb_steps_per_validation[nb_nodes] * sec_per_step_valid[size]
            global_sec = training_sec + validation_sec
            gpu_hours = sec_to_hours(global_sec * total_nb_of_gpus[nb_nodes], True)
            total_time_steps[f"{size} - {nb_nodes} nodes"] = (
                f"{sec_to_hours(global_sec)} ({gpu_hours} GPU hs) -"
                f" {batch_sizes[nb_nodes]} batch size - {nb_ep_for_ts[nb_nodes]} epochs"
            )
    print(f"Total time (whatever nb of node > 1) for {nb_training_steps} train steps:"
          f"\n{pformat(total_time_steps, width=120)}\n")
# Here a validation is performed after each epoch is finished.
print("WITH FIXED NB OF EPOCHS:\n")
for nb_epochs in all_nb_epochs:
    total_time_ep = {}
    for size, train_sec_per_step in sec_per_step.items():
        for nb_nodes, steps_per_epoch in nb_steps_per_epoch.items():
            training_sec = nb_epochs * steps_per_epoch * train_sec_per_step
            validation_sec = nb_epochs * nb_steps_per_validation[nb_nodes] * sec_per_step_valid[size]
            global_sec = training_sec + validation_sec
            # Step counts are fractional per epoch; round up to whole steps.
            nb_training_steps_ = ceil(nb_epochs * steps_per_epoch)
            gpu_hours = sec_to_hours(global_sec * total_nb_of_gpus[nb_nodes], True)
            total_time_ep[f"{size} - {nb_nodes} nodes"] = (
                f"{sec_to_hours(global_sec)} ({gpu_hours} GPU hs) -"
                f" {batch_sizes[nb_nodes]} batch size - {nb_training_steps_} train steps"
            )
    print(f"Total time for {nb_epochs} epochs:\n{pformat(total_time_ep, width=120)}\n")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment