@nctiggy
Created April 2, 2024 16:28
root@cnvrg-job-notebooksession-cx4mdtm3m6bwfbvaesss-1-5b9dbf979pzxs5:/intel-extension-for-transformers/optimum-habana/examples# python3 gaudi_spawn.py --world_size 2 --use_deepspeed language-modeling/run_lora_clm.py --model_name_or_path meta-llama/Llama-2-7b-hf --dataset_name tatsu-lab/alpaca --bf16 True --output_dir ./model_peft_output --num_train_epochs 1 --per_device_train_batch_size 2 --per_device_eval_batch_size 2 --gradient_accumulation_steps 4 --evaluation_strategy no --save_strategy steps --save_steps 2000 --save_total_limit 1 --learning_rate 1e-4 --logging_steps 1 --dataset_concatenation --do_train --use_habana --distribution_strategy fast_ddp --use_lazy_mode --throughput_warmup_steps 3
[WARNING|utils.py:185] 2024-04-02 16:22:51,269 >> optimum-habana v1.9.0.dev0 has been validated for SynapseAI v1.12.0 but habana-frameworks v1.14.0.493 was found, this could lead to undefined behavior!
[WARNING|utils.py:198] 2024-04-02 16:22:51,671 >> optimum-habana v1.9.0.dev0 has been validated for SynapseAI v1.12.0 but the driver version is v1.15.0, this could lead to undefined behavior!
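Both warnings above flag a version skew: this optimum-habana build (v1.9.0.dev0) was validated against SynapseAI v1.12.0, while the container reports habana-frameworks v1.14.0.493 and driver v1.15.0. Before digging into the failure further down, it is worth confirming exactly which versions are installed. A minimal sketch, assuming the usual pip package names for this stack (optimum-habana, habana-torch-plugin, deepspeed, transformers); check them against pip list if they differ:

# Hedged sketch: print the versions that the warnings above are comparing.
# The package names are assumptions about this container, not read from the log.
from importlib import metadata

for pkg in ("optimum-habana", "habana-torch-plugin", "deepspeed", "transformers"):
    try:
        print(f"{pkg}: {metadata.version(pkg)}")
    except metadata.PackageNotFoundError:
        print(f"{pkg}: not installed under this name")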
DistributedRunner run(): command = deepspeed --num_nodes 1 --num_gpus 2 --no_local_rank language-modeling/run_lora_clm.py --model_name_or_path meta-llama/Llama-2-7b-hf --dataset_name tatsu-lab/alpaca --bf16 True --output_dir ./model_peft_output --num_train_epochs 1 --per_device_train_batch_size 2 --per_device_eval_batch_size 2 --gradient_accumulation_steps 4 --evaluation_strategy no --save_strategy steps --save_steps 2000 --save_total_limit 1 --learning_rate 1e-4 --logging_steps 1 --dataset_concatenation --do_train --use_habana --distribution_strategy fast_ddp --use_lazy_mode --throughput_warmup_steps 3
/usr/local/bin/deepspeed:4: DeprecationWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html
__import__('pkg_resources').require('deepspeed==0.10.3+hpu.synapse.v1.13.0')
/usr/local/lib/python3.10/dist-packages/habana_frameworks/torch/hpu/__init__.py:158: UserWarning: torch.hpu.setDeterministic is deprecated and will be removed in next release. Please use torch.use_deterministic_algorithms instead.
warnings.warn(
[2024-04-02 16:22:53,124] [INFO] [real_accelerator.py:175:get_accelerator] Setting ds_accelerator to hpu (auto detect)
[2024-04-02 16:22:56,589] [WARNING] [runner.py:206:fetch_hostfile] Unable to find hostfile, will proceed with training with local resources only.
Detected CUDA_VISIBLE_DEVICES=-1 but ignoring it because one or several of --include/--exclude/--num_gpus/--num_nodes cl args were used. If you want to use CUDA_VISIBLE_DEVICES don't pass any of these arguments to deepspeed.
[2024-04-02 16:22:56,683] [INFO] [runner.py:585:main] cmd = /usr/bin/python3 -u -m deepspeed.launcher.launch --world_info=eyJsb2NhbGhvc3QiOiBbMCwgMV19 --master_addr=127.0.0.1 --master_port=29500 --no_local_rank --enable_each_rank_log=None language-modeling/run_lora_clm.py --model_name_or_path meta-llama/Llama-2-7b-hf --dataset_name tatsu-lab/alpaca --bf16 True --output_dir ./model_peft_output --num_train_epochs 1 --per_device_train_batch_size 2 --per_device_eval_batch_size 2 --gradient_accumulation_steps 4 --evaluation_strategy no --save_strategy steps --save_steps 2000 --save_total_limit 1 --learning_rate 1e-4 --logging_steps 1 --dataset_concatenation --do_train --use_habana --distribution_strategy fast_ddp --use_lazy_mode --throughput_warmup_steps 3
/usr/local/lib/python3.10/dist-packages/habana_frameworks/torch/hpu/__init__.py:158: UserWarning: torch.hpu.setDeterministic is deprecated and will be removed in next release. Please use torch.use_deterministic_algorithms instead.
warnings.warn(
[2024-04-02 16:22:58,054] [INFO] [real_accelerator.py:175:get_accelerator] Setting ds_accelerator to hpu (auto detect)
[2024-04-02 16:23:01,469] [INFO] [launch.py:146:main] WORLD INFO DICT: {'localhost': [0, 1]}
[2024-04-02 16:23:01,469] [INFO] [launch.py:152:main] nnodes=1, num_local_procs=2, node_rank=0
[2024-04-02 16:23:01,470] [INFO] [launch.py:163:main] global_rank_mapping=defaultdict(<class 'list'>, {'localhost': [0, 1]})
[2024-04-02 16:23:01,470] [INFO] [launch.py:164:main] dist_world_size=2
[2024-04-02 16:23:01,470] [INFO] [launch.py:166:main] Setting CUDA_VISIBLE_DEVICES=0,1
[WARNING|utils.py:185] 2024-04-02 16:23:04,351 >> optimum-habana v1.9.0.dev0 has been validated for SynapseAI v1.12.0 but habana-frameworks v1.14.0.493 was found, this could lead to undefined behavior!
[WARNING|utils.py:185] 2024-04-02 16:23:04,367 >> optimum-habana v1.9.0.dev0 has been validated for SynapseAI v1.12.0 but habana-frameworks v1.14.0.493 was found, this could lead to undefined behavior!
[WARNING|utils.py:198] 2024-04-02 16:23:04,954 >> optimum-habana v1.9.0.dev0 has been validated for SynapseAI v1.12.0 but the driver version is v1.15.0, this could lead to undefined behavior!
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
[WARNING|utils.py:198] 2024-04-02 16:23:04,982 >> optimum-habana v1.9.0.dev0 has been validated for SynapseAI v1.12.0 but the driver version is v1.15.0, this could lead to undefined behavior!
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
04/02/2024 16:23:06 - WARNING - __main__ - Process rank: 1, device: hpu:1, distributed training: True, 16-bits training: True
04/02/2024 16:23:06 - WARNING - __main__ - Process rank: 0, device: hpu:0, distributed training: True, 16-bits training: True
04/02/2024 16:23:06 - INFO - __main__ - Training/evaluation parameters GaudiTrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-06,
adjust_throughput=False,
auto_find_batch_size=False,
bf16=True,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_backend=hccl,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=230,
ddp_find_unused_parameters=False,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tensor_cache_hpu_graphs=False,
disable_tqdm=False,
dispatch_batches=None,
distribution_strategy=fast_ddp,
do_eval=False,
do_predict=False,
do_train=True,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=no,
fsdp_config=None,
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gaudi_config_name=None,
gradient_accumulation_steps=4,
gradient_checkpointing=False,
greater_is_better=None,
group_by_length=False,
half_precision_backend=hpu_amp,
hub_always_push=False,
hub_model_id=None,
hub_private_repo=False,
hub_strategy=every_save,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False,
ignore_eos=True,
include_inputs_for_metrics=False,
include_tokens_per_second=False,
jit_mode_eval=False,
label_names=None,
label_smoothing_factor=0.0,
learning_rate=0.0001,
length_column_name=length,
load_best_model_at_end=False,
local_rank=0,
log_level=passive,
log_level_replica=warning,
log_on_each_node=True,
logging_dir=./model_peft_output/runs/Apr02_16-23-04_cnvrg-job-notebooksession-cx4mdtm3m6bwfbvaesss-1-5b9dbf979pzxs5,
logging_first_step=False,
logging_nan_inf_filter=False,
logging_steps=1.0,
logging_strategy=steps,
lr_scheduler_type=linear,
max_grad_norm=1.0,
max_hpu_graphs=None,
max_steps=-1,
metric_for_best_model=None,
no_cuda=False,
non_blocking_data_copy=False,
num_train_epochs=1.0,
optim=adamw_torch,
optim_args=None,
output_dir=./model_peft_output,
overwrite_output_dir=False,
past_index=-1,
per_device_eval_batch_size=2,
per_device_train_batch_size=2,
pipelining_fwd_bwd=False,
prediction_loss_only=False,
profiling_record_shapes=True,
profiling_steps=0,
profiling_warmup_steps=0,
push_to_hub=False,
push_to_hub_model_id=None,
push_to_hub_organization=None,
push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
ray_scope=last,
remove_unused_columns=True,
report_to=['tensorboard'],
resume_from_checkpoint=None,
run_name=./model_peft_output,
save_on_each_node=False,
save_safetensors=False,
save_steps=2000,
save_strategy=steps,
save_total_limit=1,
seed=42,
sharded_ddp=,
skip_memory_metrics=True,
throughput_warmup_steps=3,
torch_compile=False,
torch_compile_backend=None,
torch_compile_mode=None,
torchdynamo=None,
use_cpu=False,
use_habana=True,
use_hpu_graphs=False,
use_hpu_graphs_for_inference=False,
use_hpu_graphs_for_training=False,
use_ipex=False,
use_lazy_mode=True,
use_legacy_prediction_loop=False,
use_mps_device=False,
warmup_ratio=0.0,
warmup_steps=0,
weight_decay=0.0,
)
[INFO|configuration_utils.py:715] 2024-04-02 16:23:06,295 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-hf/snapshots/8a0442e81540efaeb1a0fe3e95477b5e0edfd423/config.json
[INFO|configuration_utils.py:775] 2024-04-02 16:23:06,296 >> Model config LlamaConfig {
"_name_or_path": "meta-llama/Llama-2-7b-hf",
"architectures": [
"LlamaForCausalLM"
],
"attention_bias": false,
"bos_token_id": 1,
"eos_token_id": 2,
"hidden_act": "silu",
"hidden_size": 4096,
"initializer_range": 0.02,
"intermediate_size": 11008,
"max_position_embeddings": 4096,
"model_type": "llama",
"num_attention_heads": 32,
"num_hidden_layers": 32,
"num_key_value_heads": 32,
"pretraining_tp": 1,
"rms_norm_eps": 1e-05,
"rope_scaling": null,
"rope_theta": 10000.0,
"tie_word_embeddings": false,
"torch_dtype": "float16",
"transformers_version": "4.34.1",
"use_cache": true,
"vocab_size": 32000
}
/usr/local/lib/python3.10/dist-packages/datasets/load.py:2089: FutureWarning: 'use_auth_token' was deprecated in favor of 'token' in version 2.14.0 and will be removed in 3.0.0.
You can remove this warning by passing 'token=None' instead.
warnings.warn(
[INFO|tokenization_utils_base.py:2015] 2024-04-02 16:23:06,488 >> loading file tokenizer.model from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-hf/snapshots/8a0442e81540efaeb1a0fe3e95477b5e0edfd423/tokenizer.model
[INFO|tokenization_utils_base.py:2015] 2024-04-02 16:23:06,488 >> loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-hf/snapshots/8a0442e81540efaeb1a0fe3e95477b5e0edfd423/tokenizer.json
[INFO|tokenization_utils_base.py:2015] 2024-04-02 16:23:06,488 >> loading file added_tokens.json from cache at None
[INFO|tokenization_utils_base.py:2015] 2024-04-02 16:23:06,488 >> loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-hf/snapshots/8a0442e81540efaeb1a0fe3e95477b5e0edfd423/special_tokens_map.json
[INFO|tokenization_utils_base.py:2015] 2024-04-02 16:23:06,488 >> loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-hf/snapshots/8a0442e81540efaeb1a0fe3e95477b5e0edfd423/tokenizer_config.json
/usr/local/lib/python3.10/dist-packages/datasets/load.py:2089: FutureWarning: 'use_auth_token' was deprecated in favor of 'token' in version 2.14.0 and will be removed in 3.0.0.
You can remove this warning by passing 'token=None' instead.
warnings.warn(
/usr/local/lib/python3.10/dist-packages/datasets/table.py:1387: FutureWarning: promote has been superseded by promote_options='default'.
return cls._concat_blocks(pa_tables_to_concat_vertically, axis=0)
/usr/local/lib/python3.10/dist-packages/datasets/table.py:1387: FutureWarning: promote has been superseded by promote_options='default'.
return cls._concat_blocks(pa_tables_to_concat_vertically, axis=0)
[INFO|modeling_utils.py:2993] 2024-04-02 16:23:09,180 >> loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-hf/snapshots/8a0442e81540efaeb1a0fe3e95477b5e0edfd423/model.safetensors.index.json
[INFO|modeling_utils.py:1220] 2024-04-02 16:23:09,181 >> Instantiating GaudiLlamaForCausalLM model under default dtype torch.bfloat16.
[INFO|configuration_utils.py:770] 2024-04-02 16:23:09,181 >> Generate config GaudiGenerationConfig {
"attn_softmax_bf16": null,
"bos_token_id": 1,
"bucket_size": -1,
"eos_token_id": 2,
"ignore_eos": null,
"kv_cache_fp8": null,
"limit_hpu_graphs": null,
"reuse_cache": null,
"static_shapes": null,
"trim_logits": null
}
Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████| 2/2 [00:01<00:00, 1.28it/s]
Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████| 2/2 [00:01<00:00, 1.40it/s]
[INFO|modeling_utils.py:3775] 2024-04-02 16:23:57,193 >> All model checkpoint weights were used when initializing GaudiLlamaForCausalLM.
[INFO|modeling_utils.py:3783] 2024-04-02 16:23:57,193 >> All the weights of GaudiLlamaForCausalLM were initialized from the model checkpoint at meta-llama/Llama-2-7b-hf.
If your task is similar to the task the model of the checkpoint was trained on, you can already use GaudiLlamaForCausalLM for predictions without further training.
[INFO|configuration_utils.py:730] 2024-04-02 16:23:57,336 >> loading configuration file generation_config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-hf/snapshots/8a0442e81540efaeb1a0fe3e95477b5e0edfd423/generation_config.json
[INFO|configuration_utils.py:770] 2024-04-02 16:23:57,336 >> Generate config GaudiGenerationConfig {
"attn_softmax_bf16": null,
"bos_token_id": 1,
"bucket_size": -1,
"do_sample": true,
"eos_token_id": 2,
"ignore_eos": null,
"kv_cache_fp8": null,
"limit_hpu_graphs": null,
"max_length": 4096,
"pad_token_id": 0,
"reuse_cache": null,
"static_shapes": null,
"temperature": 0.6,
"top_p": 0.9,
"trim_logits": null
}
04/02/2024 16:24:05 - INFO - __main__ - Using data collator of type DataCollatorForLanguageModeling
trainable params: 4,194,304 || all params: 6,742,609,920 || trainable%: 0.06220594176090199
trainable params: 4,194,304 || all params: 6,742,609,920 || trainable%: 0.06220594176090199
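The two identical lines above (one per rank) are the PEFT summary. The counts are consistent with LoRA adapters on all 32 decoder layers of the LlamaConfig printed earlier, assuming the example's default adapter settings of rank 8 on the q_proj and v_proj projections (an assumption; the log does not print the PEFT config). A sketch of the arithmetic:

# Hedged sketch: reproduce the parameter counts printed above. The LoRA rank (8)
# and target modules (q_proj, v_proj) are assumed defaults of run_lora_clm.py;
# the base-model arithmetic uses the LlamaConfig values shown earlier in the log.
hidden, inter, layers, vocab = 4096, 11008, 32, 32000

per_layer = 4 * hidden * hidden        # q/k/v/o attention projections
per_layer += 3 * hidden * inter        # gate/up/down MLP projections
per_layer += 2 * hidden                # the two RMSNorm weights
base = layers * per_layer + 2 * vocab * hidden + hidden   # + embeddings, lm_head, final norm

rank, targets = 8, 2                   # assumed: LoRA rank 8 on q_proj and v_proj
lora = layers * targets * 2 * hidden * rank   # A (hidden x r) and B (r x hidden) per target

print(base)                          # 6,738,415,616
print(base + lora)                   # 6,742,609,920 -> "all params" in the log
print(lora)                          # 4,194,304     -> "trainable params" in the log
print(100 * lora / (base + lora))    # ~0.0622       -> "trainable%" in the log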
============================= HABANA PT BRIDGE CONFIGURATION ===========================
PT_HPU_LAZY_MODE = 1
PT_RECIPE_CACHE_PATH =
PT_CACHE_FOLDER_DELETE = 0
PT_HPU_RECIPE_CACHE_CONFIG =
PT_HPU_MAX_COMPOUND_OP_SIZE = 9223372036854775807
PT_HPU_LAZY_ACC_PAR_MODE = 0
PT_HPU_ENABLE_REFINE_DYNAMIC_SHAPES = 0
---------------------------: System Configuration :---------------------------
Num CPU Cores : 160
CPU RAM : 1056451616 KB
------------------------------------------------------------------------------
/usr/local/lib/python3.10/dist-packages/habana_frameworks/torch/core/weight_sharing.py:53: UserWarning: "hpu:X" notation is not supported by Gaudi PyTorch intergration bridge. Please change to "hpu" without index (Triggered internally at /npu-stack/pytorch-integration/pytorch_helpers/lazy_to_backend.cpp:53.)
return super().__torch_function__(func, types, new_args, kwargs)
[INFO|trainer.py:672] 2024-04-02 16:24:10,697 >> ***** Running training *****
[INFO|trainer.py:673] 2024-04-02 16:24:10,697 >> Num examples = 12,890
[INFO|trainer.py:674] 2024-04-02 16:24:10,697 >> Num Epochs = 1
[INFO|trainer.py:675] 2024-04-02 16:24:10,697 >> Instantaneous batch size per device = 2
[INFO|trainer.py:678] 2024-04-02 16:24:10,697 >> Total train batch size (w. parallel, distributed & accumulation) = 16
[INFO|trainer.py:679] 2024-04-02 16:24:10,697 >> Gradient Accumulation steps = 4
[INFO|trainer.py:680] 2024-04-02 16:24:10,697 >> Total optimization steps = 805
[INFO|trainer.py:681] 2024-04-02 16:24:10,698 >> Number of trainable parameters = 4,194,304
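The totals in the block above follow directly from the command-line arguments: 2 samples per device x 2 HPUs x 4 gradient-accumulation steps gives the effective batch size of 16, and 12,890 concatenated examples at that batch size yield 805 optimization steps for one epoch. A quick check:

# Sanity check of the trainer totals printed above.
per_device_batch = 2    # --per_device_train_batch_size
world_size = 2          # --world_size
grad_accum = 4          # --gradient_accumulation_steps
num_examples = 12_890   # "Num examples" above

total_batch = per_device_batch * world_size * grad_accum
print(total_batch)                  # 16
print(num_examples // total_batch)  # 805 optimization steps for 1 epoch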
/home/jenkins/workspace/cdsoftwarebuilder/create-binaries-from-sw-sources---bp-dt/repos/hcl/src/interfaces/hcl_idevice.cpp::426(allocateConnection): The condition [ isNicUp(port) ] failed. Nic(9) is DOWN, can't allocate connection
/home/jenkins/workspace/cdsoftwarebuilder/create-binaries-from-sw-sources---bp-dt/repos/hcl/src/interfaces/hcl_idevice.cpp::426(allocateConnection): The condition [ isNicUp(port) ] failed. Nic(9) is DOWN, can't allocate connection
Traceback (most recent call last):
  File "/intel-extension-for-transformers/optimum-habana/examples/language-modeling/run_lora_clm.py", line 676, in <module>
    main()
  File "/intel-extension-for-transformers/optimum-habana/examples/language-modeling/run_lora_clm.py", line 646, in main
    train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint)
  File "/intel-extension-for-transformers/optimum-habana/optimum/habana/transformers/trainer.py", line 486, in train
    return inner_training_loop(
  File "/intel-extension-for-transformers/optimum-habana/optimum/habana/transformers/trainer.py", line 720, in _inner_training_loop
    torch.distributed.broadcast(param.data, src=0)
  File "/usr/local/lib/python3.10/dist-packages/torch/distributed/c10d_logger.py", line 47, in wrapper
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/distributed/distributed_c10d.py", line 1907, in broadcast
    work = default_pg.broadcast([tensor], opts)
RuntimeError: Comm Init Rank Error
Traceback (most recent call last):
  File "/intel-extension-for-transformers/optimum-habana/examples/language-modeling/run_lora_clm.py", line 676, in <module>
    main()
  File "/intel-extension-for-transformers/optimum-habana/examples/language-modeling/run_lora_clm.py", line 646, in main
    train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint)
  File "/intel-extension-for-transformers/optimum-habana/optimum/habana/transformers/trainer.py", line 486, in train
    return inner_training_loop(
  File "/intel-extension-for-transformers/optimum-habana/optimum/habana/transformers/trainer.py", line 720, in _inner_training_loop
    torch.distributed.broadcast(param.data, src=0)
  File "/usr/local/lib/python3.10/dist-packages/torch/distributed/c10d_logger.py", line 47, in wrapper
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/distributed/distributed_c10d.py", line 1907, in broadcast
    work = default_pg.broadcast([tensor], opts)
RuntimeError: Comm Init Rank Error
Internal Error: Received signal - Segmentation fault
Internal Error: Received signal - Segmentation fault
^CReceived Interrupt
[2024-04-02 16:24:59,828] [INFO] [launch.py:316:sigkill_handler] Killing subprocess 11246
Traceback (most recent call last):
  File "/intel-extension-for-transformers/optimum-habana/examples/gaudi_spawn.py", line 109, in <module>
    main()
  File "/intel-extension-for-transformers/optimum-habana/examples/gaudi_spawn.py", line 104, in main
    ret_code = distributed_runner.run()
  File "/intel-extension-for-transformers/optimum-habana/optimum/habana/distributed/distributed_runner.py", line 214, in run
    proc.wait()
  File "/usr/lib/python3.10/subprocess.py", line 1209, in wait
    return self._wait(timeout=timeout)
  File "/usr/lib/python3.10/subprocess.py", line 1959, in _wait
    (pid, sts) = self._try_wait(0)
  File "/usr/lib/python3.10/subprocess.py", line 1917, in _try_wait
    (pid, sts) = os.waitpid(self.pid, wait_flags)
KeyboardInterrupt
root@cnvrg-job-notebooksession-cx4mdtm3m6bwfbvaesss-1-5b9dbf979pzxs5:/intel-extension-for-transformers/optimum-habana/examples# [2024-04-02 16:25:00,201] [INFO] [launch.py:316:sigkill_handler] Killing subprocess 11247
[2024-04-02 16:25:00,202] [INFO] [launch.py:325:sigkill_handler] Main process received SIGINT, exiting
^C
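What actually failed: the HCL error "Nic(9) is DOWN, can't allocate connection" means one of the Gaudi NIC ports needed for card-to-card communication could not be used, so the HCCL process group never comes up. The first collective the trainer issues under fast_ddp, the torch.distributed.broadcast of the parameters from rank 0, then fails on both ranks with "Comm Init Rank Error"; the segmentation faults and the KeyboardInterrupt cleanup in gaudi_spawn.py are fallout from that, not separate bugs. Checking the Gaudi port status on the host (and, given the warnings at the top, aligning the driver, habana-frameworks, and optimum-habana versions) is the place to start. The snippet below is a minimal sketch, not taken from the log, of a two-process HCCL check that exercises the same collective path outside the full training script; it assumes it is launched the same way as the training run, so that the usual RANK/WORLD_SIZE/MASTER_ADDR/MASTER_PORT variables are already set in the environment:

# Hedged sketch: minimal two-rank HCCL check (assumptions noted above).
import torch
import habana_frameworks.torch.core as htcore        # noqa: F401  loads the HPU backend
import habana_frameworks.torch.distributed.hccl      # noqa: F401  registers the "hccl" backend

# Process-group setup; this is where a down NIC port surfaces as "Comm Init Rank Error".
torch.distributed.init_process_group(backend="hccl")
rank = torch.distributed.get_rank()

x = torch.full((1,), float(rank), device="hpu")
torch.distributed.broadcast(x, src=0)   # same collective that failed in the training run
print(f"rank {rank}: value after broadcast = {x.item()} (expected 0.0)")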