Hamid Shojanazeri (HamidShojanazeri)

  • Meta
  • Bay Area
(llama-cuda12) bash-5.1$ torchrun --nnodes 1 --nproc_per_node 4 recipes/finetuning/finetuning.py --use_peft --peft_method lora --model_name meta-llama/Llama-2-7b-chat-hf --enable_fsdp --use_fast_kernels --pure_bf16 --dist_checkpoint_root_folder ./Llama-2-70b-chat-hf/ --dist_checkpoint_folder fine-tuned
W0408 09:24:54.152000 140136569627136 torch/distributed/run.py:757]
W0408 09:24:54.152000 140136569627136 torch/distributed/run.py:757] *****************************************
W0408 09:24:54.152000 140136569627136 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
W0408 09:24:54.152000 140136569627136 torch/distributed/run.py:757] *****************************************
Warning: unknown parameter pure_bf16
Warning: unknown parameter pure_bf16
Warning: unknown parameter pure_bf16
Warning: unknown parameter pure_bf16
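
The repeated "Warning: unknown parameter pure_bf16" lines most likely mean the script's CLI-to-config mapper found no matching field for that flag on the configs it searched. A minimal, hypothetical sketch of how such dataclass-based overrides are commonly applied; TrainConfig, FSDPConfig, and update_config below are illustrative stand-ins, not the actual llama-recipes code:

from dataclasses import dataclass, fields

@dataclass
class TrainConfig:  # illustrative stand-in, not the real llama-recipes class
    model_name: str = "meta-llama/Llama-2-7b-chat-hf"
    use_peft: bool = False
    enable_fsdp: bool = False

@dataclass
class FSDPConfig:  # illustrative stand-in
    mixed_precision: bool = True

def update_config(configs, **kwargs):
    # Apply keyword overrides to any config that defines the field;
    # warn when a key matches none of them.
    for key, value in kwargs.items():
        matched = False
        for cfg in configs:
            if key in {f.name for f in fields(cfg)}:
                setattr(cfg, key, value)
                matched = True
        if not matched:
            print(f"Warning: unknown parameter {key}")

train_cfg, fsdp_cfg = TrainConfig(), FSDPConfig()
# Neither illustrative config defines pure_bf16, so this reproduces the warning above.
update_config((train_cfg, fsdp_cfg), enable_fsdp=True, pure_bf16=True)
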
Traceback (most recent call last):
  File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/TensorRT-LLM/examples/server/server.py", line 5, in <module>
    import tensorrt_llm
  File "/TensorRT-LLM/tensorrt_llm/__init__.py", line 47, in <module>
    from .hlapi.llm import LLM, ModelConfig
  File "/TensorRT-LLM/tensorrt_llm/hlapi/__init__.py", line 1, in <module>
normalizer_spec {
  name: "identity"
  precompiled_charsmap: ""
  add_dummy_prefix: true
  remove_extra_whitespaces: false
  normalization_rule_tsv: ""
}
trainer_spec {
  input: "/large_experiments/theorem/datasets/MERGED/all.test1.merged"
from transformers import AutoModelForCausalLM, AutoTokenizer, StaticCache
import torch
from typing import Optional
import time
device = "cuda"
torch.set_float32_matmul_precision('high')
# Copied from the gpt-fast repo
def multinomial_sample_one_no_sync(probs_sort):  # Does multinomial sampling without a CUDA synchronization
    # Exponential-race trick: dividing the probabilities by i.i.d. Exponential(1)
    # noise and taking the argmax samples an index proportional to the probabilities.
    q = torch.empty_like(probs_sort).exponential_(1)
    return torch.argmax(probs_sort / q, dim=-1, keepdim=True).to(dtype=torch.int)
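
For context, a sketch of how this helper is typically combined with temperature scaling and top-k filtering to pick the next token; logits_to_probs and sample below follow gpt-fast's style but are reconstructed for illustration, not copied from the rest of this gist:

def logits_to_probs(logits, temperature: float = 1.0, top_k: Optional[int] = None):
    # Temperature-scale the logits, optionally mask everything below the
    # top-k pivot, then normalize with a softmax.
    logits = logits / max(temperature, 1e-5)
    if top_k is not None:
        v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
        pivot = v.select(-1, -1).unsqueeze(-1)
        logits = torch.where(logits < pivot, -float("Inf"), logits)
    return torch.nn.functional.softmax(logits, dim=-1)

def sample(logits, temperature: float = 1.0, top_k: Optional[int] = None):
    # Take the logits of the last position and draw one token per sequence.
    probs = logits_to_probs(logits[:, -1], temperature, top_k)
    next_token = multinomial_sample_one_no_sync(probs)
    return next_token, probs
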