# davidtorcivia/mimic_config.yaml

## mimic_config.yaml
# Machine Intelligence Made to Impersonate Characteristics: MIMIC

# NOTE: to get MPI working, first run: conda install -c conda-forge mpi4py mpich
# Launch with: accelerate launch --use_deepspeed -m axolotl.cli.train ./config_name_here
# Base checkpoint. `base_model` and `base_model_config` name the same repo id.
base_model: alpindale/Mistral-7B-v0.2-hf
base_model_config: alpindale/Mistral-7B-v0.2-hf
is_mistral_derived_model: true
model_type: MistralForCausalLM
tokenizer_type: LlamaTokenizer

# `load_in_4bit` and `load_in_8bit` are both disabled.
load_in_4bit: false
load_in_8bit: false
strict: false

# Training corpora: one `completion` file followed by three `sharegpt` files,
# each declared with `path: json` and `ds_type: json`.
datasets:
  - path: json
    ds_type: json
    data_files: humanityendures.json
    type: completion
  - path: json
    ds_type: json
    data_files: gpt_data_export.jsonl
    type: sharegpt
  - path: json
    ds_type: json
    data_files: personal_notes_sharegpt.jsonl
    type: sharegpt
  - path: json
    ds_type: json
    data_files: substack_json_data.json
    type: sharegpt

# Paths for the prepared dataset and the run output.
dataset_prepared_path: mimic_run_prepared
output_dir: ./mimic_evan


# Sequence settings: packing and padding to `sequence_len` are both enabled.
sequence_len: 8192
pad_to_sequence_len: true
sample_packing: true

# Weights & Biases: only the project is set; the remaining fields are null.
wandb_project: mimic-experiment-1
wandb_entity: null
wandb_watch: null
wandb_run_id: null
wandb_log_model: null

# Batch sizes and epoch count.
gradient_accumulation_steps: 1
micro_batch_size: 6
eval_batch_size: 6
num_epochs: 3

# Optimizer and learning-rate schedule.
optimizer: galore_adamw_8bit
lr_scheduler: cosine
learning_rate: 0.0000035
cosine_min_lr_ratio: 0
weight_decay: 0.1
# adamw hyperparams
adam_beta1: 0.9
adam_beta2: 0.999
adam_epsilon: 0.00000001
# Gradient clipping max norm
max_grad_norm: 1.0
# NEFTune embedding-noise alpha, written with the current key name;
# `noisy_embedding_alpha` is the deprecated spelling of the same setting.
neftune_noise_alpha: 5

# Arguments available for the GaLore optimizers.
optim_args:
  rank: 256  # type: int
  update_proj_gap: 200  # type: int
  scale: 0.25  # type: float
  proj_type: std  # type: str, default = std

# Target modules for the optimizer.
optim_target_modules:
  - mlp
  - self_attn
train_on_inputs: false
group_by_length: false

# Precision flags: only bf16 is enabled.
bf16: true
fp16: false
tf32: false

gradient_checkpointing: true
logging_steps: 1
# The following are intentionally left unset (null).
early_stopping_patience: null
resume_from_checkpoint: null
local_rank: null

# Attention flags: flash attention is on; xformers is left unset.
flash_attention: true
xformers_attention: null

# fsdp:
  # - full_shard
  # - auto_wrap
# fsdp_config:
  # fsdp_offload_params: false
  # fsdp_state_dict_type: FULL_STATE_DICT
  # fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
# Warm-up is given as a step count; the ratio form is kept for reference.
# warmup_ratio: 0.5
warmup_steps: 10
auto_resume_from_checkpoints: false

# NOTE(review): no `val_set_size` or evaluation dataset entry is visible in
# this file - confirm an evaluation split exists for these eval settings.
eval_steps: 10
eval_sample_packing: false

# Checkpoint saving.
saves_per_epoch: 1
save_total_limit: 2

debug: null
deepspeed: deepspeed_configs/zero2.json
	# Machine Intelligence Made to Impersonate Characteristics: MIMIC

	# NOTE run this $ conda install -c conda-forge mpi4py mpich to get mpi working
	# accelerate launch --use_deepspeed -m axolotl.cli.train ./config_name_here
	base_model: alpindale/Mistral-7B-v0.2-hf
	base_model_config: alpindale/Mistral-7B-v0.2-hf
	model_type: MistralForCausalLM
	tokenizer_type: LlamaTokenizer
	is_mistral_derived_model: true

	load_in_8bit: false
	load_in_4bit: false
	strict: false

	datasets:
	- path: json
	data_files: humanityendures.json
	ds_type: json
	type: completion
	- path: json
	data_files: gpt_data_export.jsonl
	ds_type: json
	type: sharegpt
	- path: json
	data_files: personal_notes_sharegpt.jsonl
	ds_type: json
	type: sharegpt
	- path: json
	data_files: substack_json_data.json
	ds_type: json
	type: sharegpt
	dataset_prepared_path: mimic_run_prepared
	output_dir: ./mimic_evan



	sequence_len: 8192
	sample_packing: true
	pad_to_sequence_len: true

	wandb_project: mimic-experiment-1
	wandb_entity:
	wandb_watch:
	wandb_run_id:
	wandb_log_model:

	gradient_accumulation_steps: 1
	micro_batch_size: 6
	eval_batch_size: 6
	num_epochs: 3
	optimizer: galore_adamw_8bit
	lr_scheduler: cosine
	learning_rate: 0.0000035
	cosine_min_lr_ratio: 0
	weight_decay: 0.1
	# adamw hyperparams
	adam_beta1: 0.9
	adam_beta2: 0.999
	adam_epsilon: 0.00000001
	# Gradient clipping max norm
	max_grad_norm: 1.0
	noisy_embedding_alpha: 5

	optim_args:
	#For Galore Optimizers the following optim_args are available
	rank: 256 # type: int
	update_proj_gap: 200 # type: int
	scale: 0.25 # type: float
	proj_type: "std" # type: str, default = std

	optim_target_modules:
	- mlp
	- self_attn
	train_on_inputs: false
	group_by_length: false
	bf16: true
	fp16: false
	tf32: false

	gradient_checkpointing: true
	early_stopping_patience:
	resume_from_checkpoint:
	local_rank:
	logging_steps: 1
	xformers_attention:
	flash_attention: true

	# fsdp:
	# - full_shard
	# - auto_wrap
	# fsdp_config:
	# fsdp_offload_params: false
	# fsdp_state_dict_type: FULL_STATE_DICT
	# fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
	warmup_steps: 10
	auto_resume_from_checkpoints: false
	#warmup_ratio: 0.5
	eval_steps: 10
	saves_per_epoch: 1
	eval_sample_packing: false
	save_total_limit: 2
	debug:
	deepspeed: deepspeed_configs/zero2.json