@fearnworks
Created May 28, 2023 18:42
This config produces an "Error invalid device ordinal at line 359 in file /mnt/d/training_area/bitsandbytes/csrc/pythonInterface.c" error on an A6000 single-GPU run with axolotl.
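One common cause of this invalid-device-ordinal error is a mismatch between the CUDA device index the run tries to use and the devices actually visible to the process (for example a stale CUDA_VISIBLE_DEVICES value, or a launcher passing a non-zero local_rank on a single-GPU box). A minimal sanity check, assuming only a standard PyTorch install and nothing specific to axolotl or bitsandbytes:

import os
import torch

# On a single A6000, device_count() should be 1 and only ordinal 0 is valid.
print("CUDA available:      ", torch.cuda.is_available())
print("Visible device count:", torch.cuda.device_count())
print("CUDA_VISIBLE_DEVICES:", os.environ.get("CUDA_VISIBLE_DEVICES"))

for i in range(torch.cuda.device_count()):
    print(f"cuda:{i} ->", torch.cuda.get_device_name(i))

# Anything that ends up calling torch.cuda.set_device(n) with n >= device_count()
# (e.g. a distributed launcher passing local_rank=1 here) fails with an invalid ordinal.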
# Base model to fine-tune
base_model: huggyllama/llama-7b
# Model config to load (usually the same repo as base_model)
base_model_config: huggyllama/llama-7b
# Model class used for training
model_type: LlamaForCausalLM
# Tokenizer class used to tokenize the text data
tokenizer_type: LlamaTokenizer
# Load the model in 8-bit precision
load_in_8bit: false
# Load the model in 4-bit precision (required for QLoRA)
load_in_4bit: true
# Enforce strict loading of the base model's configuration and weights
strict: false
# Push the prepared dataset to the Hugging Face Hub (empty = off)
push_dataset_to_hub:
# Datasets used for training
datasets:
  - path: teknium/GPT4-LLM-Cleaned  # dataset repo on the Hugging Face Hub
    type: alpaca                    # prompt format used for this dataset
# Path where the prepared (tokenized) dataset is cached
dataset_prepared_path: last_run_prepared
# Fraction of the data held out as a validation set
val_set_size: 0.02
# Adapter type to use for training (a rough transformers/peft equivalent of the qlora settings is sketched after the config)
adapter: qlora
# Directory of a previously trained LoRA to load (leave empty to train a new adapter)
lora_model_dir:
# Maximum sequence length for training examples
sequence_len: 256
# Maximum packed sequence length when sample packing is used (empty = no packing)
max_packed_sequence_len:
# LoRA rank (r): dimension of the low-rank update matrices; higher values add capacity and memory use
lora_r: 64
# LoRA alpha: scaling factor applied to the LoRA update (effective scale is alpha / r)
lora_alpha: 32
# Dropout applied to the LoRA layers during training
lora_dropout: 0.0
# Modules to target with LoRA adapters; left empty here so lora_target_linear picks the layers
lora_target_modules:
# If true, apply LoRA adapters to all linear layers; if false, only to the modules listed above
lora_target_linear: true
# Set to true only if the targeted layers store weights as (fan_in, fan_out), e.g. GPT-2 style Conv1D
lora_fan_in_fan_out:
# Weights & Biases project to log to (empty = no W&B logging)
wandb_project:
# Whether W&B should watch gradients and parameters
wandb_watch:
# W&B run ID to resume from (empty = create a new run)
wandb_run_id:
# Whether to log the trained model to W&B after training
wandb_log_model:
# Directory where checkpoints and other outputs are saved
output_dir: ./qlora-out
batch_size: 2                 # total batch size per optimizer step
micro_batch_size: 1           # examples processed per device per forward pass
num_epochs: 3                 # passes over the full dataset
optimizer: paged_adamw_32bit  # optimizer used for training
torchdistx_path:              # path to a torchdistx install, if used (empty = off)
lr_scheduler: cosine          # learning rate schedule
learning_rate: 0.0002         # initial learning rate for the optimizer
train_on_inputs: false        # if false, loss is not computed on prompt/input tokens
group_by_length: false        # group samples of similar length to reduce padding
bf16: false                   # bfloat16 mixed precision
fp16: true                    # fp16 mixed precision
tf32: false                   # TF32 matmuls (Ampere and newer GPUs)
gradient_checkpointing: true  # recompute activations during backprop to save memory
early_stopping_patience:      # evaluations without improvement before stopping (empty = disabled)
resume_from_checkpoint:       # path to a checkpoint to resume from (empty = start fresh)
local_rank: 0                 # local process rank; only 0 is valid on a single-GPU machine
logging_steps: 1              # how often to log training metrics
xformers_attention:           # use xformers memory-efficient attention (empty = off)
flash_attention:              # use flash attention (empty = off)
gptq_groupsize:               # GPTQ group size (unused here)
gptq_model_v1:                # GPTQ v1 model format flag (unused here)
warmup_steps: 10              # steps to ramp the learning rate up to its initial value
eval_steps: 50                # how often to run evaluation
save_steps:                   # how often to save checkpoints (empty = trainer default)
debug:                        # debug mode (empty = off)
device_map: auto              # how model weights are placed on the available devices
deepspeed:                    # path to a DeepSpeed config (unused here)
weight_decay: 0.0             # weight decay for the optimizer
fsdp:                         # FSDP settings (unused here)
fsdp_config:                  # FSDP configuration options (unused here)
special_tokens:        # special tokens used by the tokenizer
  bos_token: "<s>"     # beginning-of-sequence token
  eos_token: "</s>"    # end-of-sequence token
  unk_token: "<unk>"   # unknown token
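For reference, the quantization and LoRA settings above correspond roughly to the following transformers/peft calls. This is a minimal sketch of the kind of setup axolotl performs from this config, not axolotl's actual code; it assumes recent transformers, peft, and bitsandbytes versions, and the target_modules list is an illustrative guess for LLaMA-style models rather than something taken from this config.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model

# 4-bit quantization, mirroring load_in_4bit: true
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,  # matches fp16: true above
)

model = AutoModelForCausalLM.from_pretrained(
    "huggyllama/llama-7b",
    quantization_config=bnb_config,
    device_map="auto",                     # matches device_map: auto above
)
tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")

# LoRA adapter, mirroring lora_r / lora_alpha / lora_dropout / lora_target_linear
lora_config = LoraConfig(
    r=64,
    lora_alpha=32,
    lora_dropout=0.0,
    task_type="CAUSAL_LM",
    # Illustrative target list: the linear projections of a LLaMA-style block
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

With the adapter attached, the model would then be passed to a trainer with the optimizer and scheduler settings from the config (paged_adamw_32bit, cosine schedule, learning rate 2e-4).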