Created November 23, 2023 23:27
# 🔸 reference (BERT)
Use `bert-base-multilingual-cased` (179M params) as the reference for fairness: it is similar in size to our model (189M params).
"/Users/pi/code/m2/fff/.venv/lib/python3.11/site-packages/tqdm/ TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See\n",
" from .autonotebook import tqdm as notebook_tqdm\n",
"config.json: 100%|██████████| 625/625 [00:00<00:00, 1.84MB/s]\n",
model.safetensors: 100%|██████████| 714M/714M [01:28<00:00, 8.07MB/s]
Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
tokenizer_config.json: 100%|██████████| 29.0/29.0 [00:00<00:00, 115kB/s]
vocab.txt: 100%|██████████| 996k/996k [00:00<00:00, 2.76MB/s]
tokenizer.json: 100%|██████████| 1.96M/1.96M [00:00<00:00, 10.9MB/s]
"from transformers import pipeline\n",
"# unmasker = pipeline('fill-mask', model='bert-base-uncased') # 110M params\n",
"unmasker = pipeline('fill-mask', model='bert-base-multilingual-cased') # 179M params, 714MB\n"
62.6 ms ± 1.08 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
%timeit unmasker("The capital of France is [MASK]!")
# 🔸 Our model
`pbelcak/UltraFastBERT-1x11-long` (189M params)
import cramming
"from transformers import AutoTokenizer, AutoModelForMaskedLM\n",
"import torch\n",
"tokenizer = AutoTokenizer.from_pretrained(\"pbelcak/UltraFastBERT-1x11-long\")\n",
"model = AutoModelForMaskedLM.from_pretrained(\"pbelcak/UltraFastBERT-1x11-long\")\n"
"text = \"The capital of France is <mask>!\"\n",
"# text = \"The cat sat on <mask> mat.\"\n",
"encoded_input = tokenizer(text, return_tensors='pt')"
77.6 ms ± 2.08 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
%timeit output = model(**encoded_input)
Text: The capital of France is <mask>!
Most likely token prediction: ['paris france french here germany']
"output = model(**encoded_input)\n",
"mask_token_index = (encoded_input.input_ids[0] == tokenizer.mask_token_id).nonzero(as_tuple=True)[0]\n",
"logits = output['outputs']\n",
"top5_indices = torch.topk(logits[mask_token_index], k=5).indices\n",
"token_winners = [tokenizer.decode(u) for u in top5_indices]\n",
"print('Text:', text)\n",
"print(f'Most likely token prediction: {token_winners}')"
