Skip to content

Instantly share code, notes, and snippets.

@mzbac
Created April 27, 2024 06:51
Show Gist options
  • Save mzbac/c10ba6b8cad89942c8924a27e82a1455 to your computer and use it in GitHub Desktop.
Save mzbac/c10ba6b8cad89942c8924a27e82a1455 to your computer and use it in GitHub Desktop.
preprocess.py
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments,BitsAndBytesConfig
from datasets import load_dataset
# Hugging Face Hub id of the model whose tokenizer is loaded below.
model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Only the "train" split of the raw function-calling dataset is loaded.
dataset = load_dataset(
    "glaiveai/glaive-function-calling-v2",
    split="train",
)
def _segment_to_messages(segment):
    """Turn one ``<|endoftext|>``-delimited transcript segment into chat messages.

    Args:
        segment: Raw text between two ``<|endoftext|>`` markers.

    Returns:
        A list of ``{"role": ..., "content": ...}`` dicts (possibly empty).
        A ``USER:`` segment or a ``FUNCTION RESPONSE:`` segment yields a
        "user" message, followed by an "assistant" message when the segment
        also contains an ``ASSISTANT:`` marker. Any other segment yields
        nothing.
    """
    segment = segment.strip()
    if not segment:
        return []

    # Test the FUNCTION RESPONSE prefix first: a function payload may itself
    # contain the text "USER:", and must not be parsed as a user turn.
    if segment.startswith("FUNCTION RESPONSE:"):
        body = segment[len("FUNCTION RESPONSE:"):].strip()
        # partition() splits on the FIRST marker only, so a reply that
        # mentions "ASSISTANT:" again neither raises nor gets truncated.
        lead, marker, reply = body.partition("ASSISTANT:")
        lead = lead.strip()
    elif "USER:" in segment:
        lead, marker, reply = segment.partition("ASSISTANT:")
        # NOTE(review): the slice assumes the segment starts with "USER:";
        # the membership test above does not guarantee that.
        lead = lead.strip()[len("USER:"):].strip()
    else:
        return []

    # Function responses are emitted with the "user" role, same as user turns.
    parsed = [{"role": "user", "content": lead}]
    if marker:
        parsed.append({"role": "assistant", "content": reply.strip()})
    return parsed


def formatting_prompts_func(example):
    """Render a batch of rows as chat-template text.

    Args:
        example: Batched mapping with parallel lists under ``'system'`` and
            ``'chat'``. Each system string has its first ``len("SYSTEM:")``
            characters dropped; each chat string is a transcript whose
            segments are separated by ``<|endoftext|>``.

    Returns:
        ``{"text": [...]}`` with one string per input row, produced by the
        module-level ``tokenizer``'s ``apply_chat_template`` (untokenized,
        without a generation prompt).
    """
    output_texts = []
    for system_text, chat_text in zip(example['system'], example['chat']):
        messages = [
            {
                "role": "system",
                "content": system_text[len("SYSTEM:"):].strip(),
            },
        ]
        for segment in chat_text.split("<|endoftext|>"):
            messages.extend(_segment_to_messages(segment))
        text = tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=False
        )
        output_texts.append(text)
    return {"text": output_texts}
# Apply the batched formatter, then drop the raw "system" and "chat" columns.
dataset = dataset.map(formatting_prompts_func, batched=True).remove_columns(
    ["system", "chat"]
)
# Upload the processed dataset to the Hugging Face Hub.
dataset.push_to_hub("mzbac/glaive-function-calling-v2-llama-3-format")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment