@kasperjunge
Last active July 29, 2022 09:32
Tokenize Hugging Face Dataset or DatasetDict.
from typing import Union

from transformers import AutoTokenizer
from datasets import Dataset, DatasetDict


def tokenize_huggingface_dataset(
    ds: Union[Dataset, DatasetDict],
    tokenizer: AutoTokenizer,
    max_length: int = 512,
    truncation: bool = True,
) -> Union[Dataset, DatasetDict]:
    """Tokenize Hugging Face Dataset or DatasetDict.

    Args:
        ds (Union[Dataset, DatasetDict]): Hugging Face Dataset or DatasetDict.
        tokenizer (AutoTokenizer): Tokenizer.
        max_length (int): Max sequence length.
        truncation (bool): Whether to truncate sequences longer than max_length.

    Returns:
        Union[Dataset, DatasetDict]: Dataset with tokenized text.
    """

    def tokenize(example):
        return tokenizer(example["text"], max_length=max_length, truncation=truncation)

    return ds.map(tokenize, batched=True)