"""Demonstrates T5 unsupervised (span-corruption) denoising: building masked
inputs/labels with the Flax T5 MLM data collator and running a small training loop."""
from transformers import T5Tokenizer, T5ForConditionalGeneration
import numpy as np
import torch


class FlaxDataCollatorForT5MLM:
    """
    From https://github.com/huggingface/transformers/blob/main/examples/flax/language-modeling/run_t5_mlm_flax.py
    """

    def __init__(self, tokenizer, noise_density, mean_noise_span_length) -> None:
        self.tokenizer = tokenizer
        self.noise_density = noise_density
        self.mean_noise_span_length = mean_noise_span_length

    def create_sentinel_ids(self, mask_indices):
        """
        Sentinel ids creation given the indices that should be masked.
        The start indices of each mask are replaced by the sentinel ids in increasing
        order. Consecutive mask indices to be deleted are replaced with `-1`.
        """
        start_indices = mask_indices - np.roll(mask_indices, 1, axis=-1) * mask_indices
        start_indices[:, 0] = mask_indices[:, 0]

        sentinel_ids = np.where(start_indices != 0, np.cumsum(start_indices, axis=-1), start_indices)
        sentinel_ids = np.where(sentinel_ids != 0, (len(self.tokenizer) - sentinel_ids), 0)
        sentinel_ids -= mask_indices - start_indices

        return sentinel_ids

    def filter_input_ids(self, input_ids, sentinel_ids):
        """
        Puts the sentinel mask on `input_ids` and fuses consecutive mask tokens into a single mask token by deleting them.
        This will reduce the sequence length from `expanded_inputs_length` to `input_length`.
        """
        batch_size = input_ids.shape[0]

        input_ids_full = np.where(sentinel_ids != 0, sentinel_ids, input_ids)
        # input_ids tokens and sentinel tokens are >= 0, tokens < 0 are
        # masked tokens coming after sentinel tokens and should be removed
        input_ids = input_ids_full[input_ids_full >= 0].reshape((batch_size, -1))
        input_ids = np.concatenate(
            [input_ids, np.full((batch_size, 1), self.tokenizer.eos_token_id, dtype=np.int32)], axis=-1
        )
        return input_ids

    def random_spans_noise_mask(self, length):
        """This function is a copy of `random_spans_helper <https://github.com/google-research/text-to-text-transfer-transformer/blob/84f8bcc14b5f2c03de51bd3587609ba8f6bbd1cd/t5/data/preprocessors.py#L2682>`__ .

        Noise mask consisting of random spans of noise tokens.
        The number of noise tokens and the number of noise spans and non-noise spans
        are determined deterministically as follows:
            num_noise_tokens = round(length * noise_density)
            num_nonnoise_spans = num_noise_spans = round(num_noise_tokens / mean_noise_span_length)
        Spans alternate between non-noise and noise, beginning with non-noise.
        Subject to the above restrictions, all masks are equally likely.

        Args:
            length: an int32 scalar (length of the incoming token sequence)
            noise_density: a float - approximate density of output mask
            mean_noise_span_length: a number

        Returns:
            a boolean tensor with shape [length]
        """
        orig_length = length

        num_noise_tokens = int(np.round(length * self.noise_density))
        # avoid degeneracy by ensuring positive numbers of noise and nonnoise tokens.
        num_noise_tokens = min(max(num_noise_tokens, 1), length - 1)
        num_noise_spans = int(np.round(num_noise_tokens / self.mean_noise_span_length))

        # avoid degeneracy by ensuring positive number of noise spans
        num_noise_spans = max(num_noise_spans, 1)
        num_nonnoise_tokens = length - num_noise_tokens

        # pick the lengths of the noise spans and the non-noise spans
        def _random_segmentation(num_items, num_segments):
            """Partition a sequence of items randomly into non-empty segments.

            Args:
                num_items: an integer scalar > 0
                num_segments: an integer scalar in [1, num_items]

            Returns:
                a Tensor with shape [num_segments] containing positive integers that add
                up to num_items
            """
            mask_indices = np.arange(num_items - 1) < (num_segments - 1)
            np.random.shuffle(mask_indices)
            first_in_segment = np.pad(mask_indices, [[1, 0]])
            segment_id = np.cumsum(first_in_segment)
            # count length of sub segments assuming that list is sorted
            _, segment_length = np.unique(segment_id, return_counts=True)
            return segment_length

        noise_span_lengths = _random_segmentation(num_noise_tokens, num_noise_spans)
        nonnoise_span_lengths = _random_segmentation(num_nonnoise_tokens, num_noise_spans)

        interleaved_span_lengths = np.reshape(
            np.stack([nonnoise_span_lengths, noise_span_lengths], axis=1), [num_noise_spans * 2]
        )
        span_starts = np.cumsum(interleaved_span_lengths)[:-1]
        span_start_indicator = np.zeros((length,), dtype=np.int8)
        span_start_indicator[span_starts] = True
        span_num = np.cumsum(span_start_indicator)
        is_noise = np.equal(span_num % 2, 1)

        return is_noise[:orig_length]


def get_denoised(collator_cls, tokenizer, prompt):
    """Build a (labels, input_ids) pair for T5 span-corruption denoising from `prompt`."""
    encoded = tokenizer(prompt, truncation=False, padding=False, return_tensors="pt")
    batch_size = 1
    input_length = encoded.input_ids.shape[1]
    denoiser = collator_cls(tokenizer, noise_density=0.55, mean_noise_span_length=1.5)
    mask_indices = np.asarray([denoiser.random_spans_noise_mask(input_length) for _ in range(batch_size)])
    labels_mask = ~mask_indices
    input_ids_sentinel = denoiser.create_sentinel_ids(mask_indices.astype(np.int8))
    labels_sentinel = denoiser.create_sentinel_ids(labels_mask.astype(np.int8))
    input_ids = denoiser.filter_input_ids(encoded.input_ids, input_ids_sentinel)
    labels = denoiser.filter_input_ids(encoded.input_ids, labels_sentinel)
    return labels, input_ids
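
# Illustrative usage of get_denoised (the exact output is random; structure only):
#   labels, input_ids = get_denoised(FlaxDataCollatorForT5MLM, tokenizer, "The cute dog walks in the green park")
#   input_ids: the sentence with each noised span replaced by <extra_id_0>, <extra_id_1>, ...
#   labels:    <extra_id_0> <dropped span 0> <extra_id_1> <dropped span 1> ... </s>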


if __name__ == '__main__':

    model_name = 't5-base'
    tokenizer = T5Tokenizer.from_pretrained(model_name)
    len_tokenizer = len(tokenizer)  # 32100; the sentinel ids are derived from this
    print(f"len_tokenizer={len_tokenizer}")
    # Unsupervised denoising training
    # https://huggingface.co/docs/transformers/main/model_doc/t5#training
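    # In the span-corruption (denoising) objective the input keeps the surviving text
    # with every dropped span replaced by a sentinel token, and the target lists the
    # sentinels followed by the text they replaced, e.g.
    #   input:  "The <extra_id_0> walks in <extra_id_1> park"
    #   target: "<extra_id_0> cute dog <extra_id_1> the <extra_id_2>"
    # which is exactly the hand-written pair tokenized below.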

    print("-" * 20)
    prompt = "The <extra_id_0> walks in <extra_id_1> park"
    encoded_prompt = tokenizer(prompt, truncation=False, padding=False, return_tensors="pt").input_ids
    print(f"encoded_prompt ={encoded_prompt}")
    labels = "<extra_id_0> cute dog <extra_id_1> the <extra_id_2>"
    encoded_labels = tokenizer(labels, truncation=False, padding=False, return_tensors="pt").input_ids
    print(f"encoded_labels ={encoded_labels}")
    print(f"{encoded_prompt.shape} ={encoded_labels.shape}")
    print("-" * 20)
    # simulating the above with the collator
    prompt = "The cute dog walks in the green park"
    labels, input_ids = get_denoised(FlaxDataCollatorForT5MLM, tokenizer, prompt)
    print(f"denoised input_ids decoded = {tokenizer.decode(input_ids[0], skip_special_tokens=False)}")
    print(f"denoised labels decoded = {tokenizer.decode(labels[0], skip_special_tokens=False)}")
    # input and label lengths generally differ: they hold complementary spans plus sentinels and EOS
    print(f"input_ids.shape {input_ids.shape} labels.shape {labels.shape}")
    denoised_input_ids = torch.from_numpy(input_ids)
    denoised_labels = torch.from_numpy(labels)
    denoised_attention_mask = torch.ones(input_ids.shape)

    # note: t5-small shares the t5-base SentencePiece vocabulary, so the tokenizer above still applies
    model = T5ForConditionalGeneration.from_pretrained("t5-small")
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
    # -------------------------------------------------------------
    # Before training - check model output
    model.eval()
    test_prompt = "The cute dog walks in the"
    encoded = tokenizer(test_prompt, truncation=False, padding=False, return_tensors="pt")
    test_output = model.generate(input_ids=encoded.input_ids, num_return_sequences=3, do_sample=True, max_length=15)
    # pad_token_id=tokenizer.eos_token_id, top_k=50)
    test_answer = tokenizer.decode(test_output[0], skip_special_tokens=True)
    print(f"Before Training:'{test_prompt}'-->'{test_answer}'")
    # -------------------------------------------------------------
    # Training - Method 1: input_id == label (target)
    # with this the model generates the input_id/label as is after training - so this is not the way
    # test_prompt = "The cute dog walks in the green park"
    # encoded = tokenizer(test_prompt, truncation=False, padding=False, return_tensors="pt")
    # outputs = model(input_ids=encoded.input_ids, attention_mask=encoded.attention_mask,
    #                 labels=encoded.input_ids)
    # Method 2: using the denoised input/label pair built above
    # outputs = model(input_ids=denoised_input_ids, attention_mask=denoised_attention_mask,
    #                 labels=denoised_labels)
    # Method 3: giving a specific target
    test_prompt = "The cute dog walks in the"
    label_prompt = "green park"
    encoded = tokenizer(test_prompt, truncation=False, padding=False, return_tensors="pt")
    label = tokenizer(label_prompt, truncation=False, padding=False, return_tensors="pt")
    model.train()
    for epoch in range(100):
        # Method 1: input_id == label (target)
        # outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        # Method 2: using the denoised input/label pair
        # outputs = model(input_ids=denoised_input_ids, attention_mask=denoised_attention_mask,
        #                 labels=denoised_labels)
        # Method 3: giving a specific target
        outputs = model(input_ids=encoded.input_ids, attention_mask=encoded.attention_mask,
                        labels=label.input_ids)
        loss = outputs.loss
        if epoch % 10 == 0:
            print(f"Epoch {epoch} Loss {loss}")
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    print(f"Epoch {epoch} Loss {loss}")
    # -------------------------------------------------------------
    # After training
    model.eval()
    test_prompt = "The cute dog walks in the"
    encoded = tokenizer(test_prompt, truncation=False, padding=False, return_tensors="pt")
    test_output = model.generate(input_ids=encoded.input_ids, num_return_sequences=3, do_sample=True, max_length=25)
    test_answer = tokenizer.decode(test_output[0], skip_special_tokens=True)
    print(f"After Training:'{test_prompt}'-->'{test_answer}'")