from datasets import Dataset, load_dataset
from typing import Dict, Any
from transformers import PreTrainedTokenizer

def openai_jsonl_to_llama(jsonl_path: str, tokenizer: PreTrainedTokenizer) -> Dataset:
    """
    Converts a JSONL file in OpenAI's chat fine-tuning format into a Hugging Face
    Dataset formatted for a Llama model by applying the tokenizer's chat template
    to the 'conversations' field of each example.

    Args:
        jsonl_path (str): Path to the JSONL file containing the data.
        tokenizer (PreTrainedTokenizer): A tokenizer that implements
            `apply_chat_template` for conversation data.

    Returns:
        Dataset: A Hugging Face Dataset with a 'text' column containing each
            conversation rendered as text via the tokenizer's chat template.
    """
    def formatting_prompts_func(examples: Dict[str, Any]) -> Dict[str, Any]:
        convos = examples["conversations"]
        texts = [
            tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=False)
            for convo in convos
        ]
        return {"text": texts}

    # Load the JSONL file, rename OpenAI's "messages" field to "conversations",
    # then render every conversation to text with the chat template.
    dataset = load_dataset("json", data_files=jsonl_path, split="train")
    dataset = dataset.rename_column("messages", "conversations")
    dataset = dataset.map(formatting_prompts_func, batched=True)
    return dataset
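

# Example usage: a minimal sketch of how the function might be called.
# The model name and JSONL path below are placeholders, not part of the
# original gist; any Llama tokenizer that defines a chat template should work.
if __name__ == "__main__":
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
    dataset = openai_jsonl_to_llama("finetune_data.jsonl", tokenizer)
    print(dataset[0]["text"])  # inspect one rendered conversation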