Rebuilding Alpaca with the Hugging Face Trainer Class
from datasets import load_dataset

# Label value that PyTorch's cross-entropy loss ignores, so no loss is
# computed at these positions.
IGNORE_TOKEN = -100
#####################
#    FORMAT DATA    #
#####################

template_context = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Input:
{input}

### Response:
"""

template_no_context = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Response:
"""
def data_to_string(data):
    instruction = data["instruction"]
    context = data["input"]
    response = data["output"]

    # Use the template with an "### Input:" section only when the example
    # actually provides context.
    template = template_context if len(context) > 0 else template_no_context
    source = template.format(instruction=instruction, input=context)

    return {
        "source": source,  # the prompt alone
        "text": source + response,  # prompt plus the expected response
    }
# Load the Alpaca instruction-tuning dataset and render every record into
# prompt strings, dropping the now-redundant raw columns.
original_dataset = load_dataset("tatsu-lab/alpaca")["train"]

dataset = original_dataset.map(data_to_string).remove_columns(
    ["instruction", "input", "output"]
)
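# Quick sanity check (illustrative): each record now carries a "source"
# column (the prompt alone) and a "text" column (prompt plus response).
# print(dataset[0]["source"])
# print(dataset[0]["text"])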
#####################
#    SPLIT DATA     #
#####################

# Hold out 10% of the examples for evaluation.
processed_dataset = dataset.train_test_split(test_size=0.1)
train_dataset = processed_dataset["train"]
eval_dataset = processed_dataset["test"]
#####################
# CREATE DATALOADER #
#####################

def data_collator(features, tokenizer):
    sources = [feature["source"] for feature in features]
    targets = [feature["text"] for feature in features]

    # Tokenize prompts and full sequences separately; the prompt lengths
    # tell us how much of each label row to mask out below. This assumes
    # the tokenizer right-pads (the default for most models).
    source_tokens = tokenizer(
        sources,
        return_tensors="pt",
        padding="longest",
        max_length=None,
    )
    target_tokens = tokenizer(
        targets,
        return_tensors="pt",
        padding="longest",
        max_length=None,
    )

    labels = target_tokens["input_ids"].clone()

    # Compute loss only on response tokens: mask each prompt prefix so the
    # model is not trained to reproduce its own input.
    for i in range(len(labels)):
        source_len = source_tokens["attention_mask"][i].sum()
        labels[i, :source_len] = IGNORE_TOKEN

    # Mask padding positions too, so they contribute nothing to the loss.
    labels[target_tokens["attention_mask"] == 0] = IGNORE_TOKEN

    return {
        "input_ids": target_tokens["input_ids"],
        "attention_mask": target_tokens["attention_mask"],
        "labels": labels,
    }
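#####################
#  TRAIN (SKETCH)   #
#####################

# A minimal sketch of wiring the pieces above into the Hugging Face Trainer
# class (per the title). The checkpoint name below is a placeholder
# assumption, not part of the original; substitute the model you are
# fine-tuning. functools.partial binds the tokenizer so the collator matches
# the (features) -> batch signature the Trainer expects.
from functools import partial

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

model_name = "gpt2"  # placeholder checkpoint; swap in your own

tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    # Causal LMs often ship without a pad token; reuse EOS for padding.
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(model_name)

training_args = TrainingArguments(
    output_dir="alpaca-trainer-out",
    # Keep the raw "source"/"text" columns: the Trainer would otherwise drop
    # any column not in the model's forward() signature before the collator
    # ever sees it.
    remove_unused_columns=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=partial(data_collator, tokenizer=tokenizer),
)

trainer.train()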