Skip to content

Instantly share code, notes, and snippets.

@alexcpn
Last active March 18, 2023 17:21
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save alexcpn/e33a8b44e9774653d7492fb494fb1009 to your computer and use it in GitHub Desktop.
Save alexcpn/e33a8b44e9774653d7492fb494fb1009 to your computer and use it in GitHub Desktop.
from transformers import T5Tokenizer, T5ForConditionalGeneration
import numpy as np
import torch
class FlaxDataCollatorForT5MLM:
    """
    NumPy helpers for T5 span-corruption (masked language modelling) data.

    From https://github.com/huggingface/transformers/blob/main/examples/flax/language-modeling/run_t5_mlm_flax.py

    Args:
        tokenizer: object that supports ``len()`` (used to derive the sentinel
            ids) and exposes ``eos_token_id`` (appended by ``filter_input_ids``).
        noise_density: approximate fraction of tokens that get masked.
        mean_noise_span_length: target average length of one masked span.
    """

    def __init__(self, tokenizer, noise_density, mean_noise_span_length) -> None:
        self.tokenizer = tokenizer
        self.noise_density = noise_density
        self.mean_noise_span_length = mean_noise_span_length

    def create_sentinel_ids(self, mask_indices):
        """
        Sentinel ids creation given the indices that should be masked.

        The start indices of each mask are replaced by the sentinel ids in
        increasing order. Consecutive mask indices to be deleted are replaced
        with `-1`.

        Args:
            mask_indices: 2-D integer array of 0/1 flags, one row per sequence.

        Returns:
            Array of the same shape holding ``len(tokenizer) - k`` at the first
            position of the k-th masked span (k starts at 1), ``-1`` at the
            remaining positions of that span, and ``0`` everywhere else.
        """
        # A position starts a span when it is masked and its left neighbour is not.
        start_indices = mask_indices - np.roll(mask_indices, 1, axis=-1) * mask_indices
        # np.roll wraps the last column round to the front, so column 0 is
        # reset explicitly: a masked first token always starts a span.
        start_indices[:, 0] = mask_indices[:, 0]

        # Number the spans 1, 2, 3, ... at their start positions only.
        sentinel_ids = np.where(start_indices != 0, np.cumsum(start_indices, axis=-1), start_indices)
        # Count down from the end of the vocabulary: span k -> len(tokenizer) - k.
        sentinel_ids = np.where(sentinel_ids != 0, (len(self.tokenizer) - sentinel_ids), 0)
        # Masked positions that do not start a span become -1 (to be dropped later).
        sentinel_ids -= mask_indices - start_indices
        return sentinel_ids

    def filter_input_ids(self, input_ids, sentinel_ids):
        """
        Puts sentinel mask on `input_ids` and fuse consecutive mask tokens into a single mask token by deleting.
        This will reduce the sequence length from `expanded_inputs_length` to `input_length`.

        Args:
            input_ids: 2-D array of token ids, one row per sequence.
            sentinel_ids: array of the same shape as produced by
                ``create_sentinel_ids``.

        Returns:
            2-D array with one sentinel per masked span, the unmasked tokens
            kept as they are, and ``tokenizer.eos_token_id`` appended to each row.
        """
        batch_size = input_ids.shape[0]

        input_ids_full = np.where(sentinel_ids != 0, sentinel_ids, input_ids)
        # input_ids tokens and sentinel tokens are >= 0, tokens < 0 are
        # masked tokens coming after sentinel tokens and should be removed.
        # The reshape needs every row to keep the same number of tokens.
        input_ids = input_ids_full[input_ids_full >= 0].reshape((batch_size, -1))
        input_ids = np.concatenate(
            [input_ids, np.full((batch_size, 1), self.tokenizer.eos_token_id, dtype=np.int32)], axis=-1
        )
        return input_ids

    @staticmethod
    def _random_segmentation(num_items, num_segments):
        """Partition a sequence of items randomly into non-empty segments.

        Args:
            num_items: an integer scalar > 0
            num_segments: an integer scalar in [1, num_items]

        Returns:
            an array with shape [num_segments] containing positive integers
            that add up to num_items
        """
        # Choose num_segments - 1 of the num_items - 1 inner boundaries at random.
        mask_indices = np.arange(num_items - 1) < (num_segments - 1)
        np.random.shuffle(mask_indices)
        first_in_segment = np.pad(mask_indices, [[1, 0]])
        segment_id = np.cumsum(first_in_segment)
        # count length of sub segments assuming that list is sorted
        _, segment_length = np.unique(segment_id, return_counts=True)
        return segment_length

    def random_spans_noise_mask(self, length):
        """This function is copy of `random_spans_helper <https://github.com/google-research/text-to-text-transfer-transformer/blob/84f8bcc14b5f2c03de51bd3587609ba8f6bbd1cd/t5/data/preprocessors.py#L2682>`__ .

        Noise mask consisting of random spans of noise tokens.
        The number of noise tokens and the number of noise spans and non-noise spans
        are determined deterministically as follows:
        num_noise_tokens = round(length * noise_density)
        num_nonnoise_spans = num_noise_spans = round(num_noise_tokens / mean_noise_span_length)
        The span count is additionally limited to the number of noise tokens and
        to the number of non-noise tokens, because every span must be non-empty.
        Spans alternate between non-noise and noise, beginning with non-noise.
        Subject to the above restrictions, all masks are equally likely.

        Uses ``self.noise_density`` and ``self.mean_noise_span_length``.

        Args:
            length: an int (length of the incoming token sequence); it should be
                at least 2 so that both a noise and a non-noise token exist.

        Returns:
            a boolean NumPy array with shape [length]
        """
        num_noise_tokens = int(np.round(length * self.noise_density))
        # avoid degeneracy by ensuring positive numbers of noise and nonnoise tokens.
        num_noise_tokens = min(max(num_noise_tokens, 1), length - 1)
        num_nonnoise_tokens = length - num_noise_tokens

        num_noise_spans = int(np.round(num_noise_tokens / self.mean_noise_span_length))
        # Each noise span and each non-noise span needs at least one token.
        # Without the upper bound the two segmentations below can come back
        # with different lengths and np.stack raises ValueError.
        num_noise_spans = min(num_noise_spans, num_noise_tokens, num_nonnoise_tokens)
        # avoid degeneracy by ensuring positive number of noise spans
        num_noise_spans = max(num_noise_spans, 1)

        # pick the lengths of the noise spans and the non-noise spans
        noise_span_lengths = self._random_segmentation(num_noise_tokens, num_noise_spans)
        nonnoise_span_lengths = self._random_segmentation(num_nonnoise_tokens, num_noise_spans)

        # Interleave as [nonnoise_0, noise_0, nonnoise_1, noise_1, ...].
        interleaved_span_lengths = np.reshape(
            np.stack([nonnoise_span_lengths, noise_span_lengths], axis=1), [num_noise_spans * 2]
        )
        span_starts = np.cumsum(interleaved_span_lengths)[:-1]
        span_start_indicator = np.zeros((length,), dtype=np.int8)
        span_start_indicator[span_starts] = True
        # Running span number per position; odd-numbered spans are the noise spans.
        span_num = np.cumsum(span_start_indicator)
        is_noise = np.equal(span_num % 2, 1)
        return is_noise
def get_denoised(FlaxDataCollatorForT5MLM, tokenizer, prompt, noise_density=0.55, mean_noise_span_length=1.5):
    """Build one span-corruption (input, label) pair from a text prompt.

    Args:
        FlaxDataCollatorForT5MLM: the collator class (or any callable taking
            ``(tokenizer, noise_density, mean_noise_span_length)``) that
            provides ``random_spans_noise_mask``, ``create_sentinel_ids`` and
            ``filter_input_ids``.
        tokenizer: callable tokenizer; called with ``return_tensors="pt"`` and
            expected to return an object with an ``input_ids`` attribute of
            shape (1, sequence_length).
        prompt: the text to corrupt.
        noise_density: fraction of tokens to mask (default keeps the previously
            hard-coded 0.55).
        mean_noise_span_length: target mean length of a masked span (default
            keeps the previously hard-coded 1.5).

    Returns:
        ``(labels, input_ids)`` -- note the order -- as returned by
        ``filter_input_ids`` for the label mask and the input mask respectively.
    """
    # Tokenize WITHOUT special tokens: filter_input_ids appends the end-of-
    # sequence id itself. Letting the tokenizer add one as well made it part
    # of the corrupted text, so it could be masked in the input and showed up
    # twice ("</s></s>") at the end of the labels.
    encoded = tokenizer(
        prompt, truncation=False, padding=False, return_tensors="pt", add_special_tokens=False
    )
    # The collator works on NumPy arrays; convert the tensor once, explicitly.
    token_ids = np.asarray(encoded.input_ids)
    batch_size = 1
    input_length = token_ids.shape[1]

    denoiser = FlaxDataCollatorForT5MLM(tokenizer, noise_density, mean_noise_span_length)
    mask_indices = np.asarray([denoiser.random_spans_noise_mask(input_length) for _ in range(batch_size)])
    # The labels mask is the complement: whatever the input hides, the labels show.
    labels_mask = ~mask_indices

    input_ids_sentinel = denoiser.create_sentinel_ids(mask_indices.astype(np.int8))
    labels_sentinel = denoiser.create_sentinel_ids(labels_mask.astype(np.int8))

    input_ids = denoiser.filter_input_ids(token_ids, input_ids_sentinel)
    labels = denoiser.filter_input_ids(token_ids, labels_sentinel)
    return labels, input_ids
def main():
    """Demo: compare ways of fine-tuning T5 on a single sentence.

    Steps, in order:
      1. Tokenize a hand-written sentinel prompt/label pair and print the ids.
      2. Build a randomly span-corrupted pair with ``get_denoised`` and print it.
      3. Sample from the untrained model, train for 100 steps on one example,
         then sample again and print the result.
    """
    # NOTE(review): the tokenizer comes from 't5-base' while the model below is
    # 't5-small' -- presumably both checkpoints share one vocabulary; confirm.
    model_name = 't5-base'
    tokenizer = T5Tokenizer.from_pretrained(model_name)

    def _encode(text):
        # Single place for the tokenizer options used throughout the demo.
        return tokenizer(text, truncation=False, padding=False, return_tensors="pt")

    len_tokenizer = len(tokenizer)  # 32100 to get the sentinel ids
    print(f"len_tokenizer={len_tokenizer}")

    # Unsupervised denoising training
    # https://huggingface.co/docs/transformers/main/model_doc/t5#training
    print("-"*20)
    prompt = "The <extra_id_0> walks in <extra_id_1> park"
    encoded_prompt = _encode(prompt).input_ids
    print(f"encoded_prompt ={encoded_prompt}")
    labels = "<extra_id_0> cute dog <extra_id_1> the <extra_id_2>"
    encoded_labels = _encode(labels).input_ids
    print(f"encoded_labels ={encoded_labels}")
    print(f"{encoded_prompt.shape} ={encoded_labels.shape}")
    print("-"*20)

    # simulating the above
    prompt = "The cute dog walks in the green park"
    labels, input_ids = get_denoised(FlaxDataCollatorForT5MLM, tokenizer, prompt)
    # The batch holds exactly one sequence; index it explicitly for decoding.
    print(f"denoised input_ids decoded = {tokenizer.decode(input_ids[0],skip_special_tokens=False)}")
    print(f"denoised labels decoded = {tokenizer.decode(labels[0],skip_special_tokens=False)}")
    # The two lengths need not be equal: the input keeps the non-noise tokens
    # and the labels keep the noise tokens, each with its own sentinels.
    print(f"input_ids.shape {input_ids.shape} labels.shape {labels.shape}")

    denoised_input_ids = torch.from_numpy(input_ids)
    denoised_labels = torch.from_numpy(labels)
    denoised_attention_mask = torch.ones(input_ids.shape)

    model = T5ForConditionalGeneration.from_pretrained("t5-small")
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

    #-------------------------------------------------------------
    # Before training - check model output
    model.eval()
    test_prompt = "The cute dog walks in the"
    encoded = _encode(test_prompt)
    test_output = model.generate(input_ids=encoded.input_ids, num_return_sequences=3, do_sample=True, max_length=15)
    # Three sequences are sampled; only the first one is shown.
    test_answer = tokenizer.decode(test_output[0], skip_special_tokens=True)
    print(f"Before Training:'{test_prompt}'-->'{test_answer}'")

    #-------------------------------------------------------------
    # Training. Pick ONE of the three batches below.
    #
    # Method 1 - input_id == label (target). With this the model generates the
    # input_id/label as is after training - so this is not the way:
    #   encoded = _encode("The cute dog walks in the green park")
    #   train_batch = dict(input_ids=encoded.input_ids, attention_mask=encoded.attention_mask,
    #                      labels=encoded.input_ids)
    #
    # Method 2 - using the denoised way:
    #   train_batch = dict(input_ids=denoised_input_ids, attention_mask=denoised_attention_mask,
    #                      labels=denoised_labels)
    #
    # Method 3 - giving a specific target (active):
    test_prompt = "The cute dog walks in the"
    label_prompt = "green park"
    encoded = _encode(test_prompt)
    label = _encode(label_prompt)
    train_batch = dict(
        input_ids=encoded.input_ids,
        attention_mask=encoded.attention_mask,
        labels=label.input_ids,
    )

    model.train()
    for epoch in range(100):
        outputs = model(**train_batch)
        loss = outputs.loss
        if epoch % 10 == 0:
            # .item() prints the plain float instead of the tensor repr.
            print(f"Epoch {epoch} Loss {loss.item()}")
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    # Report the loss of the final step as well.
    print(f"Epoch {epoch} Loss {loss.item()}")

    #-------------------------------------------------------------
    # After training
    model.eval()
    test_prompt = "The cute dog walks in the"
    encoded = _encode(test_prompt)
    test_output = model.generate(input_ids=encoded.input_ids, num_return_sequences=3, do_sample=True, max_length=25)
    test_answer = tokenizer.decode(test_output[0], skip_special_tokens=True)
    print(f"After Training:'{test_prompt}'-->'{test_answer}'")


if __name__ == '__main__':
    main()

Method 1 Using Target same as Input

input=The cute dog walks in the green park
target=The cute dog walks in the green park

Output

Before Training:'The cute dog walks in the'-->'cute chien y'
Epoch 0  Loss 1.329489827156067
Epoch 10  Loss 0.16120685636997223
Epoch 20  Loss 0.06035826727747917
Epoch 30  Loss 0.012238807044923306
Epoch 40  Loss 0.009122252464294434
Epoch 50  Loss 0.006840283051133156
Epoch 60  Loss 0.006290588993579149
Epoch 70  Loss 0.004880222026258707
Epoch 80  Loss 0.19709350168704987
Epoch 90  Loss 0.012492064386606216
Epoch 99  Loss 0.005622180178761482
After Training:'The cute dog walks in the green park'-->'The cute dog walks in the green park'

Method 2 Using Denoised Training

input=The<extra_id_0> dog walks<extra_id_1> park<extra_id_2></s>
target=<extra_id_0> cute<extra_id_1> in the green<extra_id_2></s></s>

Output

len_tokenizer=32100
--------------------
encoded_prompt =tensor([[   37, 32099, 10681,    16, 32098,  2447,     1]])
encoded_labels =tensor([[32099,  5295,  1782, 32098,     8, 32097,     1]])
torch.Size([1, 7]) =torch.Size([1, 7])
--------------------
denoised input_ids decoded = The<extra_id_0> dog walks<extra_id_1> park<extra_id_2></s>
denoised labels decoded   = <extra_id_0> cute<extra_id_1> in the green<extra_id_2></s></s>
input_ids.shape (1, 8) labels.shape (1, 9)
Before Training:'The cute dog walks in the'-->'Als'
Epoch 0  Loss 3.3623902797698975
Epoch 10  Loss 1.6039639711380005
Epoch 20  Loss 1.2029192447662354
Epoch 30  Loss 0.7536050081253052
Epoch 40  Loss 1.022627353668213
Epoch 50  Loss 0.11967131495475769
Epoch 60  Loss 0.08717009425163269
Epoch 70  Loss 0.08708580583333969
Epoch 80  Loss 0.027843305841088295
Epoch 90  Loss 0.05927355960011482
Epoch 99  Loss 0.06003263220191002
After Training:'The cute dog walks in the'-->'cute dog walks in the cute cute dog'

Method 3 Using Target as is

input = "The cute dog walks in the"
target = "green park"

Output

Before Training:'The cute dog walks in the'-->'Der cute dog walks in the the'
Epoch 0  Loss 9.206318855285645
Epoch 10  Loss 6.024537563323975
Epoch 20  Loss 3.063105821609497
Epoch 30  Loss 3.7331817150115967
Epoch 40  Loss 1.68136465549469
Epoch 50  Loss 0.3739849328994751
Epoch 60  Loss 0.13449843227863312
Epoch 70  Loss 0.09859928488731384
Epoch 80  Loss 0.4800107479095459
Epoch 90  Loss 0.10699515789747238
Epoch 99  Loss 0.02133219689130783
After Training:'The cute dog walks in the'-->'green park'
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment