Bash scripts for basic summarization fine-tuning tests with pile-t5-large (plus a google/t5-v1_1-large baseline) on samsum. Note that these use a sequence length of 1024 for the input and 512 for the output.
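Before launching, it can be worth sanity-checking that those limits actually cover samsum. A minimal sketch (the model and dataset names are taken from the script below; assumes transformers and datasets are installed, and note that samsum may need trust_remote_code=True on newer datasets versions):

python - <<'EOF'
# Sketch: check samsum token lengths against the 1024-in / 512-out limits.
from datasets import load_dataset
from transformers import AutoTokenizer

# use_fast=False mirrors the --use_fast_tokenizer False flag in the script
tok = AutoTokenizer.from_pretrained("pszemraj/pile-t5-large-fixed", use_fast=False)
ds = load_dataset("samsum", split="train")
longest_src = max(len(tok(d).input_ids) for d in ds["dialogue"])
longest_tgt = max(len(tok(s).input_ids) for s in ds["summary"])
print(f"longest dialogue: {longest_src} tokens, longest summary: {longest_tgt} tokens")
EOF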
#!/bin/bash
# Set environment variables
export WANDB_PROJECT="pileT5-summ"
export WANDB_WATCH="gradients"
export WANDB_ENTITY="pszemraj"
# Count physical CPU cores (sort -u on the core/socket/node fields dedupes hyperthreads)
NUM_WORKERS=$(lscpu -p | egrep -v '^#' | sort -u -t, -k 2,4 | wc -l)
echo "Number of CPU cores: $NUM_WORKERS"
# Set variables
MODEL_NAME_OR_PATH="pszemraj/pile-t5-large-fixed"
MODEL_NAME=$(basename "$MODEL_NAME_OR_PATH")
DATASET_NAME="samsum"
DS_BASENAME=$(basename "$DATASET_NAME")
TEXT_COLUMN="dialogue"
SUMMARY_COLUMN="summary"
DATA_SEED=16919
GENERATION_MAX_LENGTH=512
GRADIENT_ACCUMULATION_STEPS=16
HUB_MODEL_ID="BEE-spoke-data/$MODEL_NAME-$DS_BASENAME"
LEARNING_RATE=1e-4
OUTPUT_DIR="./runtime/$MODEL_NAME-$DS_BASENAME"
LOGGING_DIR="./runtime/$MODEL_NAME-$DS_BASENAME/logs"
LOGGING_STEPS=3
LR_SCHEDULER_TYPE="cosine"
MAX_EVAL_SAMPLES=300
MAX_GRAD_NORM=1.0
MAX_SOURCE_LENGTH=1024
MAX_TARGET_LENGTH=512
METRIC_FOR_BEST_MODEL="rouge2"
NUM_BEAMS=1
NUM_TRAIN_EPOCHS=5
OPTIM="adamw_torch_fused"
PER_DEVICE_EVAL_BATCH_SIZE=16
PER_DEVICE_TRAIN_BATCH_SIZE=8
RUN_NAME="$MODEL_NAME-$DS_BASENAME-r2"
SAVE_STRATEGY="epoch"
SEED=17868
VAL_MAX_TARGET_LENGTH=512
WARMUP_RATIO=0.05
WEIGHT_DECAY=0.01
# Run the command
python ./run_summarization.py \
    --model_name_or_path "$MODEL_NAME_OR_PATH" \
    --do_train \
    --do_eval \
    --do_predict \
    --evaluation_strategy epoch \
    --dataset_name "$DATASET_NAME" \
    --text_column "$TEXT_COLUMN" \
    --summary_column "$SUMMARY_COLUMN" \
    --bf16 \
    --bf16_full_eval False \
    --data_seed "$DATA_SEED" \
    --dataloader_num_workers "$NUM_WORKERS" \
    --preprocessing_num_workers "$NUM_WORKERS" \
    --generation_max_length "$GENERATION_MAX_LENGTH" \
    --gradient_accumulation_steps "$GRADIENT_ACCUMULATION_STEPS" \
    --gradient_checkpointing True \
    --hub_model_id "$HUB_MODEL_ID" \
    --hub_private_repo True \
    --hub_strategy every_save \
    --learning_rate "$LEARNING_RATE" \
    --load_best_model_at_end True \
    --logging_dir "$LOGGING_DIR" \
    --logging_steps "$LOGGING_STEPS" \
    --lr_scheduler_type "$LR_SCHEDULER_TYPE" \
    --max_eval_samples "$MAX_EVAL_SAMPLES" \
    --max_grad_norm "$MAX_GRAD_NORM" \
    --max_source_length "$MAX_SOURCE_LENGTH" \
    --max_target_length "$MAX_TARGET_LENGTH" \
    --metric_for_best_model "$METRIC_FOR_BEST_MODEL" \
    --num_beams "$NUM_BEAMS" \
    --num_train_epochs "$NUM_TRAIN_EPOCHS" \
    --optim "$OPTIM" \
    --output_dir "$OUTPUT_DIR" \
    --overwrite_output_dir True \
    --pad_to_max_length False \
    --per_device_eval_batch_size "$PER_DEVICE_EVAL_BATCH_SIZE" \
    --per_device_train_batch_size "$PER_DEVICE_TRAIN_BATCH_SIZE" \
    --predict_with_generate True \
    --push_to_hub \
    --report_to wandb \
    --run_name "$RUN_NAME" \
    --save_strategy "$SAVE_STRATEGY" \
    --seed "$SEED" \
    --sortish_sampler True \
    --tf32 True \
    --val_max_target_length "$VAL_MAX_TARGET_LENGTH" \
    --warmup_ratio "$WARMUP_RATIO" \
    --weight_decay "$WEIGHT_DECAY" \
    --greater_is_better True \
    --use_fast_tokenizer False
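For reference, the effective batch size here is per_device_train_batch_size times gradient_accumulation_steps, i.e. 8 x 16 = 128 examples per optimizer step per device. A quick check using the variables defined above:

# Effective per-device batch size: 8 * 16 = 128 examples per optimizer step
echo "Effective batch size: $((PER_DEVICE_TRAIN_BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS))"

The second script is the same recipe pointed at the google/t5-v1_1-large baseline, using adamw_torch plus the inductor torch.compile backend in place of the fused optimizer: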
#!/bin/bash
# Set environment variables
export WANDB_PROJECT="pileT5-summ"
export WANDB_WATCH="gradients"
# Count physical CPU cores (sort -u on the core/socket/node fields dedupes hyperthreads)
NUM_WORKERS=$(lscpu -p | egrep -v '^#' | sort -u -t, -k 2,4 | wc -l)
echo "Number of CPU cores: $NUM_WORKERS"
# Set model ID
MODEL_ID="google/t5-v1_1-large"
MODEL_NAME=$(basename "$MODEL_ID")
# Run the summarization script
python ./run_summarization.py \
    --model_name_or_path "$MODEL_ID" \
    --do_train \
    --do_eval \
    --do_predict \
    --evaluation_strategy epoch \
    --dataset_name samsum \
    --text_column dialogue \
    --summary_column summary \
    --bf16 \
    --bf16_full_eval False \
    --data_seed 16919 \
    --dataloader_num_workers "$NUM_WORKERS" \
    --preprocessing_num_workers "$NUM_WORKERS" \
    --generation_max_length 512 \
    --gradient_accumulation_steps 16 \
    --gradient_checkpointing True \
    --hub_model_id "BEE-spoke-data/${MODEL_NAME}-samsum" \
    --hub_private_repo True \
    --hub_strategy "every_save" \
    --learning_rate 1e-4 \
    --load_best_model_at_end True \
    --logging_dir "./runtime/${MODEL_NAME}-samsum-r1/logs" \
    --logging_steps 3 \
    --lr_scheduler_type cosine \
    --max_eval_samples 300 \
    --max_grad_norm 1.0 \
    --max_source_length 1024 \
    --max_target_length 512 \
    --metric_for_best_model rouge2 \
    --num_beams 1 \
    --num_train_epochs 5 \
    --optim adamw_torch \
    --output_dir "./runtime/${MODEL_NAME}-samsum-r1" \
    --overwrite_output_dir True \
    --pad_to_max_length False \
    --per_device_eval_batch_size 16 \
    --per_device_train_batch_size 8 \
    --predict_with_generate True \
    --push_to_hub \
    --report_to wandb \
    --run_name "${MODEL_NAME}-samsum-r1" \
    --save_strategy epoch \
    --seed 17868 \
    --sortish_sampler True \
    --tf32 True \
    --torch_compile_backend "inductor" \
    --val_max_target_length 512 \
    --warmup_ratio 0.05 \
    --weight_decay 0.01 \
    --greater_is_better True
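Once a run finishes, a quick smoke test of the saved checkpoint. This is a sketch: the path matches the output_dir above, and the input dialogue is made up.

python - <<'EOF'
# Sketch: summarize one made-up dialogue with a finished checkpoint.
from transformers import pipeline

summarizer = pipeline("summarization", model="./runtime/t5-v1_1-large-samsum-r1")
dialogue = "Amanda: I baked cookies. Do you want some?\nJerry: Sure! Be there in 10."
print(summarizer(dialogue, max_length=512)[0]["summary_text"])
EOF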