# 1b: tiiuae/falcon-rw-1b
# 40b: tiiuae/falcon-40b
base_model: togethercomputer/RedPajama-INCITE-7B-Instruct
base_model_config: togethercomputer/RedPajama-INCITE-7B-Instruct
# required by falcon custom model code: https://huggingface.co/tiiuae/falcon-7b/tree/main
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
load_in_8bit: false
# enable 4bit for QLoRA
load_in_4bit: true
gptq: false
strict: false
push_dataset_to_hub:
datasets:
  - path: laion/OIG
    data_files:
      - unified_chip2.jsonl
    type: completion
dataset_prepared_path: last_run_prepared
val_set_size: 0.01
# enable QLoRA
adapter: qlora
lora_model_dir:
sequence_len: 2048
max_packed_sequence_len:
# hyperparameters from QLoRA paper Appendix B.2
# "We find hyperparameters to be largely robust across datasets"
lora_r: 64
lora_alpha: 16
# 0.1 for models up to 13B
# 0.05 for 33B and 65B models
lora_dropout: 0.05
# add LoRA modules on all linear layers of the base model
lora_target_modules:
lora_target_linear: true
lora_fan_in_fan_out:
wandb_project:
wandb_watch:
wandb_run_id:
wandb_log_model:
output_dir: ./qlora-out
# QLoRA paper Table 9
# - 16 for 7b & 13b
# - 32 for 33b, 64 for 65b
# Max size tested on A6000
# - 7b: 40
# - 40b: 4
# decrease if OOM, increase for max VRAM utilization
micro_batch_size: 1
gradient_accumulation_steps: 2
num_epochs: 3
max_steps: 3000
# Optimizer for QLoRA
optimizer: paged_adamw_32bit
torchdistx_path:
lr_scheduler: cosine
# QLoRA paper Table 9
# - 2e-4 for 7b & 13b
# - 1e-4 for 33b & 65b
learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: false
fp16: false
tf32: false
gradient_checkpointing: true
# stop training after this many evaluation losses have increased in a row
# https://huggingface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppingCallback
early_stopping_patience: 3
resume_from_checkpoint:
auto_resume_from_checkpoints: true
local_rank:
logging_steps: 1
xformers_attention: true
flash_attention:
gptq_groupsize:
gptq_model_v1:
warmup_steps: 10
eval_steps: 5
save_steps: 10
debug:
deepspeed:
weight_decay: 0.000001
fsdp:
fsdp_config:
special_tokens:
  pad_token: "<|endoftext|>"
  bos_token: ">>ABSTRACT<<"
  eos_token: "<|endoftext|>"
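For reference, below is a minimal sketch (not part of the gist and not the trainer's own code) of what the QLoRA-related keys above roughly translate to in plain transformers + peft + bitsandbytes. The model name and the r/alpha/dropout, batch, schedule, and optimizer values are copied from the config; the NF4 quant type, the fp16 compute dtype, and the GPT-NeoX linear-layer names used for "all linear layers" are assumptions.

import torch
from transformers import (AutoModelForCausalLM, AutoTokenizer,
                          BitsAndBytesConfig, TrainingArguments)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

base = "togethercomputer/RedPajama-INCITE-7B-Instruct"

# load_in_4bit: true -> quantize the frozen base weights to 4 bit
bnb = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",          # assumption: NF4 as in the QLoRA paper
    bnb_4bit_compute_dtype=torch.float16,
)
tokenizer = AutoTokenizer.from_pretrained(base)
model = AutoModelForCausalLM.from_pretrained(
    base, quantization_config=bnb, device_map="auto"
)
model = prepare_model_for_kbit_training(model)  # also preps gradient checkpointing

# adapter: qlora with lora_target_linear: true -> LoRA on every linear layer;
# for a GPT-NeoX-style model these are the four module names below (assumption)
lora = LoraConfig(
    r=64, lora_alpha=16, lora_dropout=0.05, bias="none", task_type="CAUSAL_LM",
    target_modules=["query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h"],
)
model = get_peft_model(model, lora)
model.print_trainable_parameters()

# optimizer / schedule keys expressed as TrainingArguments
args = TrainingArguments(
    output_dir="./qlora-out",
    per_device_train_batch_size=1,      # micro_batch_size
    gradient_accumulation_steps=2,
    num_train_epochs=3,
    max_steps=3000,
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    warmup_steps=10,
    optim="paged_adamw_32bit",
    weight_decay=0.000001,
    logging_steps=1,
    gradient_checkpointing=True,
)

The key names in the file match axolotl's example QLoRA configs; assuming that trainer is what the author used, a config like this would normally be passed to its finetune/train entry point rather than driven by hand-written code like the sketch above.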