# cosimo/huggingface-bert-inference.py

## huggingface-bert-inference.py
"""
Example of inference of masked LM models via Huggingface
and transformers/bert in Italian language
"""

import torch
from torch.nn import functional as F
from transformers import (
    AutoModel,
    AutoModelForMaskedLM,
    AutoModelWithLMHead,
    AutoTokenizer,
)

# Checkpoint to load from the Hugging Face Hub; the commented line is an
# alternative checkpoint kept for reference.
# model_name = "dbmdz/electra-base-italian-xxl-cased-discriminator"
model_name = "dbmdz/bert-base-italian-xxl-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# AutoModelForMaskedLM replaces the deprecated AutoModelWithLMHead: it loads
# the encoder together with the masked-LM head that yields vocabulary logits.
# model = AutoModel.from_pretrained(model_name)
model = AutoModelForMaskedLM.from_pretrained(model_name)

# Sentence to complete: the model is asked to fill in the trailing mask token.
text = "il pianeta rosso si chiama " + tokenizer.mask_token

# NOTE: despite its name, `embeddings` holds the tokenizer output (input ids,
# attention mask, ...), not embedding vectors. No padding is requested: a
# single sentence forms a batch of one, so padding it to the model's maximum
# length would only add wasted positions to the forward pass and the printout.
embeddings = tokenizer(
    text,
    add_special_tokens=True,
    truncation=True,
    return_attention_mask=True,
    return_tensors="pt",
)

# torch.where(condition) returns a tuple with one index tensor per dimension.
# The condition here is 1-D, so unpack the single tensor and index with it
# directly instead of indexing with a nested tuple.
mask_index = torch.where(
    embeddings["input_ids"][0] == tokenizer.mask_token_id
)[0]
print(embeddings)

# Inference only: skip autograd bookkeeping during the forward pass.
with torch.no_grad():
    predictions = model(**embeddings)
print(predictions)

# Turn the vocabulary logits into probabilities and keep only the row(s)
# that correspond to the masked position(s).
logits = predictions.logits
softmax = F.softmax(logits, dim=-1)
mask_word = softmax[0, mask_index, :]

# Ids of the 10 most probable tokens for the first mask in the sentence.
top_10 = torch.topk(mask_word, 10, dim=1).indices[0]

# Print the sentence once per candidate, with the mask replaced by the
# decoded token. `.tolist()` hands plain ints (not 0-d tensors) to decode().
for token in top_10.tolist():
    word = tokenizer.decode([token])
    new_sentence = text.replace(tokenizer.mask_token, word)
    print(new_sentence)