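# Notes on fine-tuning LLaMA with FastChat (FSDP), plus a LoRA/PEFT sanity check.
# Create and activate a dedicated conda environment.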
conda create --name llama -c conda-forge python=3.8
conda activate llama
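# Install a PyTorch nightly built against CUDA 11.8 and verify that the GPU is visible.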
pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu118
python -c 'import torch; print(torch.cuda.is_available())'
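# Toolchain for building CUDA extensions (e.g. the flash-attn install commented out below): gcc 10.4 and the CUDA 11.8 toolkit from conda.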
conda install -c conda-forge gxx_linux-64=10.4.0
conda install cuda -c nvidia/label/cuda-11.8.0
#pip install packaging flash-attn
#conda install -c conda-forge cudatoolkit-dev
pip install transformers==4.28.1
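# One-time setup (disabled below): download the original LLaMA weights and convert them to Hugging Face format.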
if false; then
conda install -c conda-forge tokenizers=0.13.3
pip install accelerate==0.18.0
pip install sentencepiece==0.1.98
pip install protobuf==3.20
git clone --depth 1 --branch v4.29.2 git@github.com:huggingface/transformers.git
cd transformers/src/
export PYTHONPATH=`pwd`
python transformers/models/llama/convert_llama_weights_to_hf.py -h
conda install -c conda-forge aria2
aria2c --file-allocation=none 'magnet:?xt=urn:btih:b8287ebfa04f879b048d4d4404108cf3e8014352&dn=LLaMA&tr=udp%3a%2f%2ftracker.opentrackr.org%3a1337%2fannounce'
python transformers/models/llama/convert_llama_weights_to_hf.py --input_dir /store2/scratch/w32zhong/llama/ --model_size 30B --output_dir /store2/scratch/w32zhong/llama/30B-hgf/
fi
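# Install FastChat (provides the training scripts) together with einops.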
pip install fschat einops
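# B: per-device batch size, N: number of GPU processes; select the GPUs to use and clear any previous output.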
B=1
N=4
export CUDA_VISIBLE_DEVICES=0,1,3,4
rm -rf output
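# Full-parameter fine-tuning of the converted 7B checkpoint on FastChat's dummy data, sharded across $N GPUs with FSDP.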
torchrun --nproc_per_node=$N --master_port=20001 fastchat/train/train_mem.py \
    --model_name_or_path ../7B-hgf/ \
    --data_path playground/data/dummy.json \
    --bf16 True \
    --output_dir output \
    --num_train_epochs 3 \
    --per_device_train_batch_size $B \
    --per_device_eval_batch_size $B \
    --gradient_accumulation_steps 16 \
    --evaluation_strategy "no" \
    --save_strategy "steps" \
    --save_steps 1200 \
    --save_total_limit 10 \
    --learning_rate 2e-5 \
    --weight_decay 0. \
    --warmup_ratio 0.03 \
    --lr_scheduler_type "cosine" \
    --logging_steps 1 \
    --fsdp "full_shard auto_wrap" \
    --fsdp_transformer_layer_cls_to_wrap 'LlamaDecoderLayer' \
    --model_max_length 2048 \
    --gradient_checkpointing True \
    --lazy_preprocess True \
    --tf32 True
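# Dependencies for the (optional) DeepSpeed / LoRA fine-tuning route.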
pip install deepspeed
pip install git+https://github.com/huggingface/peft
#deepspeed train_lora.py --deepspeed <$PATH_TO_DEEPSPEED_CONFIG>
#exit
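# The remainder is a separate Python snippet: wrap the converted LLaMA checkpoint with LoRA adapters (PEFT) and check which scaled-dot-product attention backends are enabled.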
import torch
from transformers import LlamaForCausalLM
from transformers import LlamaTokenizerFast
from peft import LoraConfig, get_peft_model
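# Report how many parameters remain trainable after the model is wrapped with LoRA.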
def print_trainable_parameters(model):
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params:,} || all params: {all_param:,} || trainable%: {100 * trainable_params / all_param:,}"
    )
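# Load the tokenizer and base LLaMA model from the HF-format 7B checkpoint.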
tokenizer = LlamaTokenizerFast.from_pretrained('7B-hgf')
model = LlamaForCausalLM.from_pretrained('7B-hgf')
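# Apply LoRA only to the attention query/value projections of each decoder layer.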
TARGET_MODULES = [
    "q_proj",
    "v_proj",
]
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8, lora_dropout=0.05,
    lora_alpha=16, bias='none',
    target_modules=TARGET_MODULES,
)
model = get_peft_model(model, lora_config)
print_trainable_parameters(model)
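# Check which scaled-dot-product attention backends are reported inside a flash-only kernel context (math and memory-efficient backends disabled).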
with torch.backends.cuda.sdp_kernel(
    enable_flash=True,
    enable_math=False,
    enable_mem_efficient=False
):
    print(torch.backends.cuda.flash_sdp_enabled())
    print(torch.backends.cuda.mem_efficient_sdp_enabled())
    print(torch.backends.cuda.math_sdp_enabled())
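# A minimal sketch, not part of the original gist: persist only the LoRA adapter
# weights (PEFT's save_pretrained on the wrapped model), so the full base model
# does not need to be re-saved. The output directory name 'lora-adapter' is an
# arbitrary, hypothetical choice.
model.save_pretrained('lora-adapter')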