Plugins:
- Time remaining 1508357010 https://ankiweb.net/shared/info/1508357010
- Customize keyboard shortcuts 24411424 https://ankiweb.net/shared/info/24411424
- I set the editor cloze shortcut to Ctrl-Shift-M (Ctrl = Cmd on Mac); the "m" stands for "mask"
Plugins:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import time
from tqdm import tqdm
from pathlib import Path
import pandas as pd
models = ['sshleifer/distilbart-cnn-12-3',
          'sshleifer/distilbart-cnn-12-6',
          'sshleifer/distilbart-cnn-6-6',
Avoid:
- [!h] placement specifiers for figures/tables.
- Introducing terms that haven't been defined.

from pathlib import Path
import fire
from tqdm import tqdm
DS_TO_KEY = {
    'gigaword': ('document', 'summary'),
    'xsum': ('document', 'summary'),
    'aeslc': ('email_body', 'subject_line'),
from pathlib import Path
import fire
from tqdm import tqdm
DS_TO_KEY = {
    'gigaword': ('document', 'summary'),
    'xsum': ('document', 'summary'),
    'aeslc': ('email_body', 'subject_line'),
Six groups of models inherit from BartForConditionalGeneration.
The major differences between them are listed below; this document focuses on layernorm timing.
# Base S3 prefix for the Hugging Face model hub bucket.
export b="s3://models.huggingface.co/bert"

# Sync one model directory from the stas/ namespace to facebook/.
# Usage: stas_to_fb <model_dir> [extra aws-s3-sync flags...]
# Fixes: removed stray "| |" extraction artifacts (they would be parsed as
# pipes); quoted the S3 paths and used "$@" so model names / flags containing
# spaces or glob characters are passed through intact.
stas_to_fb () {
    src=$1
    shift
    aws s3 sync "$b/stas/$src" "$b/facebook/$src" "$@"
}
stas_to_allenai () { | |
src=$1 | |
shift |
(4, 512), (4, 12), or (4, small_int),
which don't fully utilize the GPU.
Dynamic batch size: try to organize batches to total 4*512 = 2048
tokens, so one batch might be shaped (4, 512) and another (32, 64).
python finetune.py \
    --task summarization \
    --learning_rate=3e-4 \
    --do_train \
    --do_predict \
    --val_check_interval 0.25 --n_val 1000 \
    --data_dir xsum \
    --max_source_length 512 --max_target_length=56 \
    --freeze_embeds \
    --model_name_or_path google/pegasus-large \
# Print the first non-blank line of every */*bleu.json, labeled by filename.
# Bug fix: the original `for file in ls */*bleu.json` iterated over the
# literal word "ls" as the first item (the glob already expands; `ls` here is
# just an extra word, not a command). Also replaced the useless `cat | sed`
# with sed reading the file directly, and quoted "$file" throughout.
for file in */*bleu.json
do
    echo "$file:"
    # -n + '/^\s*$/!{p;q}': print the first line that is not blank, then quit.
    sed -n '/^\s*$/!{p;q}' "$file"
    echo "------"
done
EN-RO test BLEU (distil-mbart unless otherwise specified, before post-processing).