@dlwh
Created August 12, 2022 18:11
bad eval output
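# Minimal reproduction for the "bad eval output" issue: Trainer.evaluate() returns a
# non-finite eval_loss when the eval dataset is an IterableDataset, dataloader_num_workers > 0,
# and more than one GPU is in use (see the conditions noted in main() and the assert at the end).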
from itertools import chain
from typing import Optional

import numpy as np
import datasets
import torch
import transformers
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    default_data_collator,
)
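# NOTE: the gist does not pin library versions; a transformers 4.x-era API is assumed here
# (Trainer still accepts `tokenizer=` directly), along with compatible datasets and torch installs.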
def main():
    # doesn't fail if dataloader_num_workers is 0, not using IterableDataset, or not using more than one gpu
    training_args: TrainingArguments = TrainingArguments(output_dir="scr", dataloader_num_workers=1)

    raw_dataset = datasets.load_dataset("wikitext", "wikitext-2-raw-v1", split="validation")

    # model doesn't matter, just something smallish
    tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
    model = AutoModelForCausalLM.from_pretrained("distilgpt2")

    column_names = raw_dataset.column_names
    text_column_name = "text" if "text" in column_names else column_names[0]

    def tokenize_function(examples):
        output = tokenizer(examples[text_column_name])
        return output

    with training_args.main_process_first(desc="dataset map tokenization"):
        tokenized_dataset = raw_dataset.map(
            tokenize_function,
            batched=True,
            num_proc=1,
            remove_columns=column_names,
            desc="Running tokenizer on dataset",
        )
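    # tokenized_dataset now holds one entry per raw text line with the tokenizer's outputs
    # (input_ids and attention_mask); the original "text" column has been removed.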
    block_size = tokenizer.model_max_length

    # Main data processing function that will concatenate all texts from our dataset and
    # generate chunks of block_size.
    def group_texts(examples):
        # Concatenate all texts.
        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
        total_length = len(concatenated_examples[list(examples.keys())[0]])
        # We drop the small remainder; we could pad instead if the model supported it.
        # You can customize this part to your needs.
        if total_length >= block_size:
            total_length = (total_length // block_size) * block_size
        # Split by chunks of max_len.
        result = {
            k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
            for k, t in concatenated_examples.items()
        }
        result["labels"] = result["input_ids"].copy()
        return result

    with training_args.main_process_first(desc="grouping texts together"):
        lm_dataset = tokenized_dataset.map(
            group_texts,
            batched=True,
            num_proc=1,
            desc=f"Grouping texts in chunks of {block_size}",
        )
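    # lm_dataset is still a map-style datasets.Dataset: fixed-length blocks of block_size
    # token ids with labels equal to input_ids, the usual causal-LM evaluation setup.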
    class WrapperDataset(torch.utils.data.IterableDataset):
        def __init__(self, ds):
            self.ds = ds

        def __iter__(self):
            return iter(self.ds)

    eval_dataset = WrapperDataset(lm_dataset)
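    # Wrapping the map-style dataset in an IterableDataset is one of the trigger conditions
    # noted at the top of main(); evaluating lm_dataset directly does not reproduce the
    # non-finite eval_loss.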
    # Initialize our Trainer (no training; train_dataset is left as None and only evaluate() is called)
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=None,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        # Data collator will default to DataCollatorWithPadding, so we change it.
        data_collator=default_data_collator,
    )

    metrics = trainer.evaluate()
    assert np.isfinite(metrics["eval_loss"])
def _mp_fn(index):
    # For xla_spawn (TPUs)
    main()


if __name__ == "__main__":
    main()
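# The gist does not include a launch command. As noted above, the failure needs more than one
# GPU; a typical way to run the repro under those conditions (assuming the script is saved as
# repro.py) would be:
#   torchrun --nproc_per_node=2 repro.py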