# Tengoles/openai_jsonl_to_llama.py

## openai_jsonl_to_llama.py
from typing import Dict, Any

from datasets import Dataset, load_dataset
from transformers import PreTrainedTokenizer

def openai_jsonl_to_llama(jsonl_path: str, tokenizer: PreTrainedTokenizer) -> Dataset:
    """
    Convert an OpenAI-style chat JSONL file into a dataset of chat-templated text.

    The file is loaded with the "json" dataset builder, its "messages" column is
    renamed to "conversations", and every conversation is rendered to a single
    string with the tokenizer's chat template. The rendered strings are stored
    in a new "text" column.

    Args:
        jsonl_path (str): The file path to the JSONL file containing the data.
            Every record must provide a "messages" field (presumably a list of
            role/content message dicts, as in OpenAI's chat fine-tuning format;
            verify against the data being converted).
        tokenizer (PreTrainedTokenizer): A tokenizer exposing
            `apply_chat_template`, used to turn each conversation into text.

    Returns:
        Dataset: The loaded dataset with "messages" renamed to "conversations"
        and an added "text" column holding the templated conversation.
    """

    def formatting_prompts_func(examples: Dict[str, Any]) -> Dict[str, Any]:
        # Invoked by `Dataset.map(..., batched=True)`, so `examples` maps each
        # column name to a list of values for the current batch.
        convos = examples["conversations"]
        # tokenize=False asks for the rendered string rather than token ids;
        # add_generation_prompt=False is passed through unchanged from the
        # original call so complete conversations are rendered as-is.
        texts = [
            tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=False)
            for convo in convos
        ]
        return {"text": texts}

    # A single data file passed to the "json" builder is exposed as the "train" split.
    dataset = load_dataset("json", data_files=jsonl_path, split="train")
    dataset = dataset.rename_column("messages", "conversations")

    dataset = dataset.map(formatting_prompts_func, batched=True)

    return dataset