Skip to content

Instantly share code, notes, and snippets.

Last active May 19, 2022 03:51
Show Gist options
  • Save badjano/6dcc20ade47029ebb8f508282c37808c to your computer and use it in GitHub Desktop.
Save badjano/6dcc20ade47029ebb8f508282c37808c to your computer and use it in GitHub Desktop.
A T5 model test in Portuguese
import os
import random
# This assignment is placed before the transformers import on purpose, so the
# variable is already in the environment when that import runs.
# NOTE(review): presumably this silences TensorFlow's C++ log output — confirm.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
from transformers import T5Tokenizer, TFT5ForConditionalGeneration
# Checkpoint size names used to build the model identifier; index 1 ("base")
# is the one loaded below.
model_sizes = ["small", "base", "large"]
model_name = f'unicamp-dl/ptt5-{model_sizes[1]}-portuguese-vocab'

# Load the tokenizer and the model from the same checkpoint name.
tokenizer = T5Tokenizer.from_pretrained(model_name)
print(f'Tokenizer from {model_name} loaded.')
model = TFT5ForConditionalGeneration.from_pretrained(model_name)
print(f'Model from {model_name} loaded.')

# Prompt containing <extra_id_N> placeholders inside the sentence.
text = "Os conflitos <extra_id_0> ocorridos na Alemanha e solucionados em 25 de setembro de 1555 com a <extra_id_1> da Paz de Augsburgo inauguraram um período no qual cada <extra_id_2> podia impor sua crença aos habitantes de seus domínios."

# Split on "." and strip each fragment before filtering, so that
# whitespace-only pieces (e.g. after a trailing ". ") are dropped and later
# sentences do not carry a leading space into the tokenizer.
sentences = [s for s in (part.strip() for part in text.split(".")) if s]
sentence = random.choice(sentences)

# Tokenize the chosen sentence as a batch of one, generate up to 150 tokens,
# and decode the first (only) returned sequence.
input_ids = tokenizer([sentence], return_tensors="tf").input_ids
outputs = model.generate(input_ids, max_length=150)
# NOTE(review): skip_special_tokens=True presumably also removes the
# <extra_id_N> markers from the decoded text, so the predicted spans would be
# printed without separators — confirm this is the intended output.
answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(f"input: {sentence}\noutput: {answer}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment