@dlwh
Created August 12, 2022 18:11
bad eval output
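# Minimal reproduction for the "bad eval output" issue: Trainer.evaluate() returns a
# non-finite eval_loss when the eval dataset is an IterableDataset, dataloader_num_workers > 0,
# and more than one GPU is in use (see the conditions noted in main() and the assert at the end).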
from itertools import chain
from typing import Optional

import numpy as np
import datasets
import torch
import transformers
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    default_data_collator,
)
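# NOTE: the gist does not pin library versions; a transformers 4.x-era API is assumed here
# (Trainer still accepts `tokenizer=` directly), along with compatible datasets and torch installs.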
def main():
    # doesn't fail if dataloader_num_workers is 0, not using IterableDataset, or not using more than one gpu
    training_args: TrainingArguments = TrainingArguments(output_dir="scr", dataloader_num_workers=1)

    raw_dataset = datasets.load_dataset("wikitext", "wikitext-2-raw-v1", split="validation")

    # model doesn't matter, just something smallish
    tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
    model = AutoModelForCausalLM.from_pretrained("distilgpt2")

    column_names = raw_dataset.column_names
    text_column_name = "text" if "text" in column_names else column_names[0]

    def tokenize_function(examples):
        output = tokenizer(examples[text_column_name])
        return output

    with training_args.main_process_first(desc="dataset map tokenization"):
        tokenized_dataset = raw_dataset.map(
            tokenize_function,
            batched=True,
            num_proc=1,
            remove_columns=column_names,
            desc="Running tokenizer on dataset",
        )
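    # tokenized_dataset now holds one entry per raw text line with the tokenizer's outputs
    # (input_ids and attention_mask); the original "text" column has been removed.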
    block_size = tokenizer.model_max_length

    # Main data processing function that will concatenate all texts from our dataset and
    # generate chunks of block_size.
    def group_texts(examples):
        # Concatenate all texts.
        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
        total_length = len(concatenated_examples[list(examples.keys())[0]])
        # We drop the small remainder; we could pad instead if the model supported it.
        # You can customize this part to your needs.
        if total_length >= block_size:
            total_length = (total_length // block_size) * block_size
        # Split by chunks of max_len.
        result = {
            k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
            for k, t in concatenated_examples.items()
        }
        result["labels"] = result["input_ids"].copy()
        return result

    with training_args.main_process_first(desc="grouping texts together"):
        lm_dataset = tokenized_dataset.map(
            group_texts,
            batched=True,
            num_proc=1,
            desc=f"Grouping texts in chunks of {block_size}",
        )
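    # lm_dataset is still a map-style datasets.Dataset: fixed-length blocks of block_size
    # token ids with labels equal to input_ids, the usual causal-LM evaluation setup.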
    class WrapperDataset(torch.utils.data.IterableDataset):
        def __init__(self, ds):
            self.ds = ds

        def __iter__(self):
            return iter(self.ds)

    eval_dataset = WrapperDataset(lm_dataset)
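    # Wrapping the map-style dataset in an IterableDataset is one of the trigger conditions
    # noted at the top of main(); evaluating lm_dataset directly does not reproduce the
    # non-finite eval_loss.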
    # Initialize our Trainer (no training; train_dataset is left as None and only evaluate() is called)
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=None,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        # Data collator will default to DataCollatorWithPadding, so we change it.
        data_collator=default_data_collator,
    )

    metrics = trainer.evaluate()
    assert np.isfinite(metrics["eval_loss"])
def _mp_fn(index):
    # For xla_spawn (TPUs)
    main()


if __name__ == "__main__":
    main()
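# The gist does not include a launch command. As noted above, the failure needs more than one
# GPU; a typical way to run the repro under those conditions (assuming the script is saved as
# repro.py) would be:
#   torchrun --nproc_per_node=2 repro.py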