Last active
October 18, 2024 20:14
-
-
Save Tengoles/e65a588fa72d2e7edb1db5cca6984af7 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from typing import Any, Dict

from datasets import Dataset, load_dataset
from transformers import PreTrainedTokenizer
def openai_jsonl_to_llama(jsonl_path: str, tokenizer: PreTrainedTokenizer) -> Dataset:
    """Load an OpenAI-style chat JSONL file and render each conversation as text.

    The file is read with the Hugging Face ``json`` loader, its ``messages``
    column is renamed to ``conversations``, and a ``text`` column is added
    holding the output of ``tokenizer.apply_chat_template`` for every
    conversation.

    Args:
        jsonl_path: Path to the JSONL file. Every record must provide a
            ``messages`` field, because that column is renamed below.
        tokenizer: Tokenizer whose ``apply_chat_template`` method renders a
            conversation (presumably a list of role/content messages, as in
            OpenAI's chat format) into a single string.

    Returns:
        The loaded dataset with ``messages`` renamed to ``conversations`` and
        an added ``text`` column containing the templated conversations.
    """

    def formatting_prompts_func(examples: Dict[str, Any]) -> Dict[str, Any]:
        """Render one batch of conversations into chat-template strings."""
        convos = examples["conversations"]
        # tokenize=False requests the rendered text instead of token ids;
        # add_generation_prompt=False leaves off the trailing assistant
        # header, since the conversations are passed through as-is.
        texts = [
            tokenizer.apply_chat_template(
                convo, tokenize=False, add_generation_prompt=False
            )
            for convo in convos
        ]
        return {"text": texts}

    # A single data file passed to the "json" loader is exposed as the
    # "train" split.
    dataset = load_dataset("json", data_files=jsonl_path, split="train")
    dataset = dataset.rename_column("messages", "conversations")
    # batched=True hands the callback a dict of column -> list of values,
    # which is the shape formatting_prompts_func indexes into.
    dataset = dataset.map(formatting_prompts_func, batched=True)
    return dataset
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment