Estimate distributed training time (multi-node / multi-GPU)
""" | |
Small script estimating the training time of models, depending on number of nodes / GPUs / batch size... | |
""" | |
from pprint import pformat | |
from math import ceil | |
def sec_to_hours(nb_sec: float, no_seconds: bool = False) -> str: | |
m, s = divmod(nb_sec, 60) | |
h, m = divmod(m, 60) | |
h, m, s = map(int, (h, m, s)) | |
if no_seconds: | |
return f"{h}h:{m}m" | |
return f"{h}h:{m}m:{s}s" | |

# Constants
nb_train_samples = 5521826
nb_valid_samples = 55776
per_device_batch_size = 64
per_device_batch_size_valid = 64
nb_devices_per_node = 8  # GPUs per node
validation_steps = 1000  # in the first scenario, a validation is run every 1000 training steps
all_nb_nodes = range(1, 9)  # 1 to 8 nodes
all_nb_training_steps = (60000, 80000, 100000)
all_nb_epochs = (15, 20, 25)

# Measured step times, for 2 nodes and a batch size of 64 per device.
# They vary very little with a different nb of nodes > 1.
sec_per_step = {  # model size -> seconds per training step
    "360m": 1.61,  # V100
    "1b": 1.76,  # A100
    "3b": 4.03,  # A100
}
sec_per_step_valid = {  # model size -> seconds per validation step, written as 1 / (steps per second)
    "360m": 1 / 2.44,  # V100
    "1b": 1 / 2.49,  # A100
    "3b": 1 / 1.18,  # A100
}
batch_sizes = {nb_nodes: nb_nodes * per_device_batch_size * nb_devices_per_node for nb_nodes in all_nb_nodes}
batch_sizes_valid = {nb_nodes: nb_nodes * per_device_batch_size_valid * nb_devices_per_node for nb_nodes in all_nb_nodes}
nb_steps_per_epoch = {nb_nodes: nb_train_samples / bsz for nb_nodes, bsz in batch_sizes.items()}
nb_steps_per_validation = {nb_nodes: nb_valid_samples / bsz for nb_nodes, bsz in batch_sizes_valid.items()}
total_nb_of_gpus = {nb_nodes: nb_nodes * nb_devices_per_node for nb_nodes in all_nb_nodes}
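# e.g. with 2 nodes: global batch size = 2 * 8 * 64 = 1024 samples,
# so one epoch = 5521826 / 1024 ≈ 5392.4 training steps.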

# First scenario: a validation is performed every `validation_steps` training steps
print("WITH FIXED NB OF TRAINING STEPS:\n")
for nb_training_steps in all_nb_training_steps:
    nb_ep_for_ts = {nb_nodes: round(nb_training_steps / spe, 2) for nb_nodes, spe in nb_steps_per_epoch.items()}
    nb_validations = nb_training_steps // validation_steps
    total_time_steps = {}
    for size, sps in sec_per_step.items():
        training_sec = nb_training_steps * sps  # independent of the nb of nodes, as sec/step is assumed constant
        for nb_nodes in all_nb_nodes:
            validation_sec = nb_validations * nb_steps_per_validation[nb_nodes] * sec_per_step_valid[size]
            global_sec = validation_sec + training_sec
            total_time_steps[f"{size} - {nb_nodes} nodes"] = (
                f"{sec_to_hours(global_sec)} ({sec_to_hours(global_sec * total_nb_of_gpus[nb_nodes], True)} GPU hours)"
                f" - {batch_sizes[nb_nodes]} batch size - {nb_ep_for_ts[nb_nodes]} epochs"
            )
    print(f"Total time (for any nb of nodes > 1) for {nb_training_steps} train steps:"
          f"\n{pformat(total_time_steps, width=120)}\n")

# Second scenario: a validation is performed after each epoch
print("WITH FIXED NB OF EPOCHS:\n")
for nb_epochs in all_nb_epochs:
    total_time_ep = {}
    for size, sps in sec_per_step.items():
        for nb_nodes, spe in nb_steps_per_epoch.items():
            training_sec = nb_epochs * spe * sps
            validation_sec = nb_epochs * nb_steps_per_validation[nb_nodes] * sec_per_step_valid[size]
            global_sec = validation_sec + training_sec
            nb_training_steps_ = ceil(nb_epochs * spe)
            total_time_ep[f"{size} - {nb_nodes} nodes"] = (
                f"{sec_to_hours(global_sec)} ({sec_to_hours(global_sec * total_nb_of_gpus[nb_nodes], True)} GPU hours)"
                f" - {batch_sizes[nb_nodes]} batch size - {nb_training_steps_} train steps"
            )
    print(f"Total time for {nb_epochs} epochs:\n{pformat(total_time_ep, width=120)}\n")