Created
November 6, 2022 19:37
-
-
Save cosimo/a688cfc23935713b9d7467cad3660c29 to your computer and use it in GitHub Desktop.
Inference of Italian-language BERT models via Hugging Face/PyTorch
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Example of inference of masked LM models via Huggingface | |
and transformers/bert in Italian language | |
""" | |
import torch | |
from torch.nn import functional as F | |
from transformers import AutoModel, AutoModelWithLMHead, AutoTokenizer | |
#model_name = "dbmdz/electra-base-italian-xxl-cased-discriminator" | |
model_name = "dbmdz/bert-base-italian-xxl-uncased" | |
tokenizer = AutoTokenizer.from_pretrained(model_name) | |
#model = AutoModel.from_pretrained(model_name) | |
model = AutoModelWithLMHead.from_pretrained(model_name) | |
text = "il pianeta rosso si chiama " + tokenizer.mask_token | |
embeddings = tokenizer.encode_plus(text, add_special_tokens=True, truncation=True, padding="max_length", return_attention_mask=True, return_tensors = "pt") | |
mask_index = torch.where(embeddings["input_ids"][0] == tokenizer.mask_token_id) | |
print(embeddings) | |
predictions = model(**embeddings) | |
print(predictions) | |
logits = predictions.logits | |
softmax = F.softmax(logits, dim=-1) | |
mask_word = softmax[0, mask_index, :] | |
top_10 = torch.topk(mask_word, 10, dim=1)[1][0] | |
for token in top_10: | |
word = tokenizer.decode([token]) | |
new_sentence = text.replace(tokenizer.mask_token, word) | |
print(new_sentence) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment