@pszemraj · Last active April 16, 2024
Bash script for basic fine-tuning tests with pile-t5-large on SAMSum summarization. Note that it uses a maximum sequence length of 1024 tokens for the input and 512 for the output.
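A minimal way to launch it, assuming the Hugging Face run_summarization.py example script sits in the working directory (the filename and single-GPU device index below are placeholders, not part of the gist):

# hypothetical invocation; save the script below as e.g. train_pile_t5_samsum.sh
CUDA_VISIBLE_DEVICES=0 bash train_pile_t5_samsum.sh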
#!/bin/bash
# Set environment variables
export WANDB_PROJECT="pileT5-summ"
export WANDB_WATCH="gradients"
export WANDB_ENTITY="pszemraj"
NUM_WORKERS=$(lscpu -p | egrep -v '^#' | sort -u -t, -k 2,4 | wc -l)
echo "Number of CPU cores: $NUM_WORKERS"
# Set variables
MODEL_NAME_OR_PATH="pszemraj/pile-t5-large-fixed"
MODEL_NAME=$(basename "$MODEL_NAME_OR_PATH")
DATASET_NAME="samsum"
DS_BASENAME=$(basename "$DATASET_NAME")
TEXT_COLUMN="dialogue"
SUMMARY_COLUMN="summary"
DATA_SEED=16919
GENERATION_MAX_LENGTH=512
GRADIENT_ACCUMULATION_STEPS=16
HUB_MODEL_ID="BEE-spoke-data/$MODEL_NAME-$DS_BASENAME"
LEARNING_RATE=1e-4
OUTPUT_DIR="./runtime/$MODEL_NAME-$DS_BASENAME"
LOGGING_DIR="./runtime/$MODEL_NAME-$DS_BASENAME/logs"
LOGGING_STEPS=3
LR_SCHEDULER_TYPE="cosine"
MAX_EVAL_SAMPLES=300
MAX_GRAD_NORM=1.0
MAX_SOURCE_LENGTH=1024
MAX_TARGET_LENGTH=512
METRIC_FOR_BEST_MODEL="rouge2"
NUM_BEAMS=1
NUM_TRAIN_EPOCHS=5
OPTIM="adamw_torch_fused"
PER_DEVICE_EVAL_BATCH_SIZE=16
PER_DEVICE_TRAIN_BATCH_SIZE=8
RUN_NAME="$MODEL_NAME-$DS_BASENAME-r2"
SAVE_STRATEGY="epoch"
SEED=17868
VAL_MAX_TARGET_LENGTH=512
WARMUP_RATIO=0.05
WEIGHT_DECAY=0.01
# Run the command
python ./run_summarization.py \
--model_name_or_path "$MODEL_NAME_OR_PATH" \
--do_train \
--do_eval \
--do_predict \
--evaluation_strategy epoch \
--dataset_name "$DATASET_NAME" \
--text_column "$TEXT_COLUMN" \
--summary_column "$SUMMARY_COLUMN" \
--bf16 \
--bf16_full_eval False \
--data_seed "$DATA_SEED" \
--dataloader_num_workers "$NUM_WORKERS" \
--preprocessing_num_workers "$NUM_WORKERS" \
--generation_max_length "$GENERATION_MAX_LENGTH" \
--gradient_accumulation_steps "$GRADIENT_ACCUMULATION_STEPS" \
--gradient_checkpointing True \
--hub_model_id "$HUB_MODEL_ID" \
--hub_private_repo True \
--hub_strategy every_save \
--learning_rate "$LEARNING_RATE" \
--load_best_model_at_end True \
--logging_dir "$LOGGING_DIR" \
--logging_steps "$LOGGING_STEPS" \
--lr_scheduler_type "$LR_SCHEDULER_TYPE" \
--max_eval_samples "$MAX_EVAL_SAMPLES" \
--max_grad_norm "$MAX_GRAD_NORM" \
--max_source_length "$MAX_SOURCE_LENGTH" \
--max_target_length "$MAX_TARGET_LENGTH" \
--metric_for_best_model "$METRIC_FOR_BEST_MODEL" \
--num_beams "$NUM_BEAMS" \
--num_train_epochs "$NUM_TRAIN_EPOCHS" \
--optim "$OPTIM" \
--output_dir "$OUTPUT_DIR" \
--overwrite_output_dir True \
--pad_to_max_length False \
--per_device_eval_batch_size "$PER_DEVICE_EVAL_BATCH_SIZE" \
--per_device_train_batch_size "$PER_DEVICE_TRAIN_BATCH_SIZE" \
--predict_with_generate True \
--push_to_hub \
--report_to wandb \
--run_name "$RUN_NAME" \
--save_strategy "$SAVE_STRATEGY" \
--seed "$SEED" \
--sortish_sampler True \
--tf32 True \
--val_max_target_length "$VAL_MAX_TARGET_LENGTH" \
--warmup_ratio "$WARMUP_RATIO" \
--weight_decay "$WEIGHT_DECAY" \
--greater_is_better True \
--use_fast_tokenizer False
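For reference, a quick sanity check on the schedule these settings imply (a single-GPU sketch; the SAMSum train split is roughly 14.7k dialogues, so the step count is approximate):

# optional sanity check; run after the variable definitions above (single-GPU assumption)
EFFECTIVE_BATCH=$((PER_DEVICE_TRAIN_BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS))  # 8 * 16 = 128
echo "effective train batch size: $EFFECTIVE_BATCH"
echo "approx. optimizer steps per epoch: $((14732 / EFFECTIVE_BATCH))"          # ~115 steps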
# --------------------------------------------------------------------------
# Variant run: same recipe, but fine-tuning google/t5-v1_1-large with the
# stock adamw_torch optimizer and torch.compile's inductor backend.
# --------------------------------------------------------------------------
# Set environment variables
export WANDB_PROJECT="pileT5-summ"
export WANDB_WATCH="gradients"
NUM_WORKERS=$(lscpu -p | egrep -v '^#' | sort -u -t, -k 2,4 | wc -l)
echo "Number of CPU cores: $NUM_WORKERS"
# Set model ID
MODEL_ID="google/t5-v1_1-large"
MODEL_NAME=$(basename "$MODEL_ID")
# Run the summarization script
python ./run_summarization.py \
--model_name_or_path $MODEL_ID \
--do_train \
--do_eval \
--do_predict \
--evaluation_strategy epoch \
--dataset_name samsum \
--text_column dialogue \
--summary_column summary \
--bf16 \
--bf16_full_eval False \
--data_seed 16919 \
--dataloader_num_workers $NUM_WORKERS \
--preprocessing_num_workers $NUM_WORKERS \
--generation_max_length 512 \
--gradient_accumulation_steps 16 \
--gradient_checkpointing True \
--hub_model_id BEE-spoke-data/${MODEL_NAME}-samsum \
--hub_private_repo True \
--hub_strategy "every_save" \
--learning_rate 1e-4 \
--load_best_model_at_end True \
--logging_dir ./runtime/$MODEL_NAME-samsum-r1/logs \
--logging_steps 3 \
--lr_scheduler_type cosine \
--max_eval_samples 300 \
--max_grad_norm 1.0 \
--max_source_length 1024 \
--max_target_length 512 \
--metric_for_best_model rouge2 \
--num_beams 1 \
--num_train_epochs 5 \
--optim adamw_torch \
--output_dir ./runtime/$MODEL_NAME-samsum-r1 \
--overwrite_output_dir True \
--pad_to_max_length False \
--per_device_eval_batch_size 16 \
--per_device_train_batch_size 8 \
--predict_with_generate True \
--push_to_hub \
--report_to wandb \
--run_name $MODEL_NAME-samsum-r1 \
--save_strategy epoch \
--seed 17868 \
--sortish_sampler True \
--tf32 True \
--torch_compile_backend "inductor" \
--val_max_target_length 512 \
--warmup_ratio 0.05 \
--weight_decay 0.01 \
--greater_is_better True
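Once a run has pushed to the Hub, a quick smoke test of the checkpoint might look like the sketch below. The repo id follows the HUB_MODEL_ID pattern from the first script but is pushed as private, so it requires authentication first; the dialogue is an arbitrary example, and the snippet assumes transformers and torch are installed.

# hypothetical smoke test; run `huggingface-cli login` beforehand for the private repo
python - <<'EOF'
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline

# repo id derived from HUB_MODEL_ID in the first script (assumption, adjust if renamed)
repo = "BEE-spoke-data/pile-t5-large-fixed-samsum"
tokenizer = AutoTokenizer.from_pretrained(repo, use_fast=False)  # mirrors --use_fast_tokenizer False
model = AutoModelForSeq2SeqLM.from_pretrained(repo)
summarize = pipeline("summarization", model=model, tokenizer=tokenizer)

dialogue = (
    "Amanda: I baked cookies. Do you want some?\n"
    "Jerry: Sure!\n"
    "Amanda: I'll bring them to you tomorrow :-)"
)
print(summarize(dialogue, max_length=64, num_beams=1)[0]["summary_text"])
EOF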