@mzbac
Last active May 9, 2024 00:35
glaiveai/glaive-function-calling-v2 clean up — reformat the dataset into Llama 3 chat-template text with parseable function-call JSON

from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig
from datasets import load_dataset
import json

model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
dataset = load_dataset("glaiveai/glaive-function-calling-v2", split="train")
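
# Expected raw record shape (inferred from the parsing below, shown here only as
# an illustration): the "system" column starts with a "SYSTEM:" prefix, and the
# "chat" column holds turns separated by "<|endoftext|>" and prefixed with
# "USER:", "ASSISTANT:" or "FUNCTION RESPONSE:". Assistant function calls appear
# as <functioncall> {"name": ..., "arguments": '...'}, where the "arguments"
# value is a single-quoted JSON string — which is why cleanup() below exists.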

def cleanup(input_string):
    """Strip the single quotes around the "arguments" value so the whole
    function-call payload parses as JSON."""
    arguments_index = input_string.find('"arguments"')
    if arguments_index == -1:
        return input_string
    start_quote = input_string.find("'", arguments_index)
    if start_quote == -1:
        return input_string
    end_quote = input_string.rfind("'")
    if end_quote == -1 or end_quote <= start_quote:
        return input_string
    arguments_value = input_string[start_quote + 1:end_quote]
    output_string = input_string[:start_quote] + arguments_value + input_string[end_quote + 1:]
    return output_string
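
# Illustrative sanity check (not part of the original gist; the sample string is
# made up in the shape the dataset uses): cleanup() should turn the single-quoted
# "arguments" payload into inline JSON that json.loads can parse.
_sample = '{"name": "calculate_tip", "arguments": \'{"bill_amount": 50}\'}'
assert json.loads(cleanup(_sample))["arguments"]["bill_amount"] == 50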

def formatting_prompts_func(example):
    """Convert a batch of glaive records into Llama 3 chat-template strings,
    skipping any conversation whose function-call JSON cannot be parsed."""
    output_texts = []
    for i in range(len(example['system'])):
        messages = [
            {
                "role": "system",
                "content": example['system'][i][len("SYSTEM:"):].strip(),
            },
        ]
        conversations = example['chat'][i].split("<|endoftext|>")
        continue_outer = False  # set when a turn contains unparseable JSON
        for message in conversations:
            message = message.strip()
            if not message:
                continue
            if "USER:" in message:
                user_content = message.split("ASSISTANT:")[0].strip()
                messages.append({"role": "user", "content": user_content[len("USER:"):].strip()})
                if "ASSISTANT:" in message:
                    assistant_content = message.split("ASSISTANT:")[1].strip()
                    if "<functioncall>" in assistant_content:
                        text = assistant_content.replace("<functioncall>", "").strip()
                        json_str = cleanup(text)
                        try:
                            json.loads(json_str)  # validate only
                        except json.JSONDecodeError:
                            print(f"0 - Failed to decode JSON: {json_str} - {assistant_content}")
                            continue_outer = True
                            break
                        messages.append({"role": "assistant", "content": "<functioncall> " + json_str})
                    else:
                        messages.append({"role": "assistant", "content": assistant_content})
            elif message.startswith("FUNCTION RESPONSE:"):
                function_response = message[len("FUNCTION RESPONSE:"):].strip()
                if "ASSISTANT:" in function_response:
                    function_content, assistant_content = function_response.split("ASSISTANT:")
                    try:
                        json.loads(function_content.strip())  # validate only
                    except json.JSONDecodeError:
                        print(f"1 - Failed to decode JSON: {function_content}")
                        continue_outer = True
                        break
                    messages.append({"role": "user", "content": function_content.strip()})
                    messages.append({"role": "assistant", "content": assistant_content.strip()})
                else:
                    try:
                        json.loads(function_response.strip())  # validate only
                    except json.JSONDecodeError:
                        print(f"2 - Failed to decode JSON: {function_response}")
                        continue_outer = True
                        break
                    messages.append({"role": "user", "content": function_response.strip()})
            elif message.startswith("ASSISTANT:"):
                assistant_content = message.split("ASSISTANT:")[1].strip()
                if "<functioncall>" in assistant_content:
                    text = assistant_content.replace("<functioncall>", "").strip()
                    json_str = cleanup(text)
                    try:
                        json.loads(json_str)  # validate only
                    except json.JSONDecodeError:
                        print(f"3 - Failed to decode JSON: {json_str} - {assistant_content}")
                        continue_outer = True
                        break
                    messages.append({"role": "assistant", "content": "<functioncall> " + json_str})
        if continue_outer:
            continue  # drop the whole conversation rather than emit broken JSON
        text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
        output_texts.append(text)
    # Drop the source columns so the mapped batch may legitimately shrink when
    # conversations are skipped; only the new "text" column is returned.
    del example['system']
    del example['chat']
    return {"text": output_texts}
dataset = dataset.map(formatting_prompts_func, batched=True)
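
# Quick inspection (illustrative, not from the original gist): after the map call
# the dataset should expose only the new "text" column of chat-templated strings.
print(dataset.column_names)      # expected: ['text']
print(dataset[0]["text"][:500])  # preview the first cleaned conversation
# Optionally persist the cleaned split, e.g. as JSON Lines (file name is arbitrary):
# dataset.to_json("glaive_function_calling_v2_cleaned.jsonl")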