# LimaRP-ShareGPT (@bdashore3)
# Converts LimaRP YAML conversations into ShareGPT-format JSONL training data.
import glob
import re
import os.path
import statistics
import random
import pandas
import yaml
from transformers import LlamaTokenizer, AutoTokenizer
# Training examples designed for 4k context size, used for the initial ~1000
# samples release of LimaRP.
files = glob.glob('./data/**/*.yaml', recursive=True)
outfile = 'out/train4k.jsonl'
# Training examples designed for 8k context size. Avoid using with a 4k context
# size (in other words, limiting data length to 4096 tokens using the option below),
# as character personas and scenario may end up not being accurate to the context.
# files = glob.glob('./data-long/**/*.yaml', recursive=True)
# outfile = 'out/train8k.jsonl'
# Old evals. These are generally lower-quality training examples removed from the
# initially-made 4k dataset, or training examples with unwanted issues.
# files = glob.glob('./data-evals/**/*.yaml', recursive=True)
# outfile = '/home/anon/bin/qlora/piper/eval.jsonl'
# The pretrained model path is needed for using its tokenizer
pretrained_model_path = r'F:\AI\models\meta-llama_Llama-2-13b-hf'
# Try to limit training example length by removing early messages, without clipping
# them. This works better if the RP messages in the conversations aren't too long,
# otherwise training examples can end up being significantly shorter than the limit.
limit_data = False
limit_data_length = 4096
# This changes the way the data is arranged in the output json files, affecting
# model prediction during training in subtle ways. Supported formats:
#
# 'output_only'
# everything on the output like Guanaco (similar to unsupervised tuning).
# 'bot_output'
# system+conversation on the input, last bot response on the output (same as Pygmalion).
# 'system_input'
# system on the input, entire conversation on the output.
train_format = 'output_only'
# Use original character names instead of replacing them with alternative labels.
use_original_names = True
# Alternative labels to prepend at the start of the utterance, e.g. `USER:`
label_user = 'USER'
label_bot = 'CHAR'
# Alternative labels to use inside the utterance
placeholder_user = 'USER'
placeholder_bot = 'CHAR'
# Various instruct and model sequences. Original LimaRP format.
seq_system = '<<SYSTEM>>'
seq_human = '<<HUMAN>>'
seq_aibot = '<<AIBOT>>'
# Same, but Alpaca format. Doesn't appear to work as well as the LimaRP format.
# seq_system = '### Instruction:'
# seq_human = '### Input:'
# seq_aibot = '### Response:'
# Character placeholders baked in the files; **DO NOT CHANGE**
placeholder_user_old = '<SECOND>'
placeholder_bot_old = '<FIRST>'
# System prompt options
two_char_system_prompts = [
"""Enter roleplay mode. You are currently %{having a conversation|in conversation|in a roleplay chat} with <SECOND>, whose %{traits are|persona is|characteristics are}:
<SECOND PERSONA>
%{You are|Play the role of|Take the role of} <FIRST> with the following %{persona|definitions|character sheet|traits}:
<FIRST PERSONA>
%{In addition|Additionally|Also}, %{keep the following scenario in mind|remember this scenario|pay attention to this scenario}:
<SCENARIO>""",
"""<SECOND>'s Persona: <SECOND PERSONA>
<FIRST>'s Persona: <FIRST PERSONA>
You are <FIRST>. Using the above %{persona|traits|character sheet|character definitions} for <FIRST>, you must engage in %{a roleplay conversation|an RP chat} with <SECOND>. %{Keep the following scenario in mind|Remember this scenario|Pay attention to this scenario}:
<SCENARIO>""",
]
single_char_system_prompts = [
"""Enter roleplay mode. %{You are|Play the role of|Take the role of|Become the character} <FIRST> with the following %{persona|definitions|character sheet|traits}:
<FIRST PERSONA>
%{In addition|Additionally|Also}, %{keep the following scenario in mind|remember this scenario|pay attention to this scenario}:
<SCENARIO>""",
"""<FIRST>'s Persona: <FIRST PERSONA>
You are <FIRST>. Using the above %{persona|traits|character sheet|character definitions}, you must engage in %{a roleplay conversation|an RP chat}. %{Keep the following scenario in mind|Remember this scenario|Pay attention to this scenario}:
<SCENARIO>"""
]
# Use two-character or one-character system prompts
use_two_char_sys = True
# -1 = both prompts, 0 = system prompt A, 1 = system prompt B
system_prompt_index = 1
tokenizer = LlamaTokenizer.from_pretrained(pretrained_model_path)
def substitute_participants(input_string):
    '''
    Replace placeholder usernames with different names.
    TODO: the implementation could be improved by *not* relying on external variables.
    '''
    input_string = input_string.replace("<FIRST PERSONA>", source['persona']['<FIRST>'])
    input_string = input_string.replace("<SECOND PERSONA>", source['persona']['<SECOND>'])
    input_string = input_string.replace("<SCENARIO>", source['scenario'])
    input_string = input_string.replace("<SECOND>", placeholder_user)
    input_string = input_string.replace("<FIRST>", placeholder_bot)
    return input_string
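# Illustrative example (names assumed): with placeholder_bot = 'Alice',
# placeholder_user = 'Bob', and personas/scenario taken from the currently
# loaded `source`:
#   substitute_participants('<FIRST> waves at <SECOND>.')
#   -> 'Alice waves at Bob.'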
def fix_punctuation(input_string):
    '''
    Replace fancy/incorrect punctuation with simpler/correct punctuation.
    TODO: more effective regexes, options for controlling what should be changed.
    '''
    # Fix excessive horizontal whitespace. This should go before everything else.
    input_string = re.sub(r' {2,}', ' ', input_string)
    # General punctuation fixes
    input_string = input_string.replace(' !', '!')
    input_string = input_string.replace(' ?', '?')
    input_string = input_string.replace('’', "'")
    input_string = input_string.replace('‘', "'")
    input_string = input_string.replace('“', '"')
    input_string = input_string.replace('”', '"')
    input_string = input_string.replace('…', '...')
    # Replace the em-dash surrogate `---` used in the source files with an
    # actual em-dash, since some people apparently dislike typing them directly.
    input_string = input_string.replace('---', '—')
    # Fix incorrect ellipses. This should preferably be fixed in the
    # source files themselves.
    input_string = re.sub(r'(\w)\.{2,8}(\w)', r'\1... \2', input_string)
    input_string = re.sub(r'(\w)\.{3,8}', r'\1...', input_string)
    return input_string
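# Illustrative example:
#   fix_punctuation('“Wait…  really ?”')
#   -> '"Wait... really?"'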
def detect_single_newlines(input_string):
    '''
    Single newlines are most of the time unwanted.
    '''
    # Remove double newlines first
    input_string = input_string.replace('\n\n', '||')
    # Now detect single newline presence
    return input_string.find('\n') >= 0
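# Illustrative example:
#   detect_single_newlines('First paragraph.\n\nSecond paragraph.')  -> False
#   detect_single_newlines('First line.\nSecond line.')              -> True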
def validate_placeholders(input_string, placeholder_list):
    '''
    Verify that the placeholders (baked into the files) have been used
    correctly: they must not be left open and must not contain typos.
    '''
    search_placeholders = re.findall('<.*?>', input_string)
    if search_placeholders:
        # If the list is non-empty, some strings were detected as placeholders.
        for item in search_placeholders:
            # Compare the strings to the known placeholders. They must match exactly.
            if item not in placeholder_list:
                return False
    # A naive check is counting the number of '<' and '>'. They must match, although
    # this means that these symbols cannot be used for other things, which was already
    # the assumption anyway.
    str_lt = input_string.count('<')
    str_gt = input_string.count('>')
    if str_lt != str_gt:
        return False
    # All checks passed
    return True
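# Illustrative example:
#   validate_placeholders('<FIRST> nods.', ['<SECOND>', '<FIRST>'])  -> True
#   validate_placeholders('<FIRTS> nods.', ['<SECOND>', '<FIRST>'])  -> False (typo)
#   validate_placeholders('<FIRST nods.',  ['<SECOND>', '<FIRST>'])  -> False (unbalanced)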
def get_prompt(prompt_string):
    pattern = re.compile(r'%{(.+?)}')
    for m in re.finditer(pattern, prompt_string):
        match = m.group(0)
        replace = random.choice(m.group(1).split("|"))
        prompt_string = prompt_string.replace(match, replace)
    return prompt_string
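# Illustrative example of the %{a|b|c} alternation syntax used in the system
# prompts above:
#   get_prompt('%{You are|Play the role of} <FIRST>.')
#   -> 'You are <FIRST>.' or 'Play the role of <FIRST>.' (picked at random)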
message_count = 0
entries = []
token_lengths = []
count = 0
for num, file in enumerate(files):
    with open(file, 'r', encoding='utf-8') as f:
        source = yaml.safe_load(f)
    if use_original_names:
        label_user = source['names']['<SECOND>']
        label_bot = source['names']['<FIRST>']
        placeholder_user = source['names']['<SECOND>']
        placeholder_bot = source['names']['<FIRST>']
    if label_user is None:
        raise NameError(f"{file}: USER must have a name.")
    elif label_bot is None:
        raise NameError(f"{file}: CHAR must have a name.")
    elif not (3 <= len(label_user) <= 20) or not (3 <= len(label_bot) <= 20):
        raise NameError(f"{file}: unusual character name length: possible issue.")
    elif ('SECOND' in source['names']['<SECOND>']) or ('FIRST' in source['names']['<FIRST>']):
        # No need to check for the exact string here, just if there's a clearly
        # defective name that likely resulted from user error while manually
        # processing the files.
        raise NameError(f"{file}: Incorrect character names.")
    elif (source['persona']['<FIRST>'] is None) or (len(source['persona']['<FIRST>']) < 20):
        raise SyntaxError(f'{file}: no persona defined for <FIRST>.')
    elif (source['persona']['<SECOND>'] is None) or (len(source['persona']['<SECOND>']) < 20):
        raise SyntaxError(f'{file}: no persona defined for <SECOND>.')
    elif source['scenario'] is None:
        raise SyntaxError(f'{file}: Scenario missing.')
    elif len(source['scenario']) < 100:
        raise SyntaxError(f'{file}: Probable error in scenario (too short).')
    # Perform various per-message checks, and store the length of the bot's
    # messages in words.
    message_lengths = []
    for message in source['conversation']:
        # Check if there are open (unpaired) quotation marks
        quotation_marks = message['text'].count('"')
        if (quotation_marks % 2) != 0:
            print(f'\n{file}: Open quotation marks\n{message["text"]}\n\n')
        # Check if there are unpaired asterisks (for inner thoughts, etc.)
        paired_asterisks = message['text'].count('*')
        if (paired_asterisks % 2) != 0:
            print(f'\n{file}: Unpaired asterisks\n{message["text"]}\n\n')
        # Check for single newlines
        if detect_single_newlines(message['text']):
            print(f'\n{file}: Single newline\n{message["text"]}\n\n')
        # Check if placeholders have been used correctly in the messages
        if not validate_placeholders(message['text'], [placeholder_user_old, placeholder_bot_old]):
            print(f'\n{file}: Incorrect placeholder\n{message["text"]}\n\n')
        if message['name'] == '<FIRST>':
            message_lengths.append(len(message['text'].split()))
    # Check if placeholders have been used correctly in personas and scenarios
    if not validate_placeholders(source['scenario'], [placeholder_user_old, placeholder_bot_old]):
        print(f'\n{file}: Incorrect placeholder in scenario (or: < > not allowed)\n{source["scenario"]}\n\n')
    if not validate_placeholders(source['persona']['<FIRST>'], [placeholder_user_old, placeholder_bot_old]):
        print(f'\n{file}: Incorrect placeholder in <FIRST> Persona (or: < > not allowed)\n{source["persona"]["<FIRST>"]}\n\n')
    if not validate_placeholders(source['persona']['<SECOND>'], [placeholder_user_old, placeholder_bot_old]):
        print(f'\n{file}: Incorrect placeholder in <SECOND> Persona (or: < > not allowed)\n{source["persona"]["<SECOND>"]}\n\n')
    # This section tries to limit the total number of tokens below a predefined limit.
    # It operates on a message basis. The current algorithm slows down parsing as
    # it repeats tokenization - it was made quickly to test the idea.
    if limit_data:
        # First = GPT, Second = Human
        tokens_persona_1 = len(tokenizer(source['persona']['<FIRST>'])['input_ids'])
        tokens_persona_2 = len(tokenizer(source['persona']['<SECOND>'])['input_ids'])
        tokens_scenario = len(tokenizer(source['scenario'])['input_ids'])
        tokens_extra = 20 + 45  # Extra tokens to take into account newlines and preamble
        tokens_header = tokens_persona_1 + tokens_persona_2 + tokens_scenario + tokens_extra
        tokens_messages = []
        total_tokens = tokens_header
        for message in source['conversation']:
            message_tokens = 16  # Worst-case allowance for newlines, system sequences and character names
            message_tokens += len(tokenizer(message['text'])['input_ids'])
            tokens_messages.append(message_tokens)
        # Naive search for the smallest subset of messages whose token length,
        # summed with that of the header, stays below the limit.
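        # Worked example (assumed numbers): with limit_data_length = 4096,
        # tokens_header = 600 and tokens_messages = [900, 800, 700, 600, 500],
        # i_start = 0 gives 600 + 3500 = 4100 (over the limit) while
        # i_start = 1 gives 600 + 2600 = 3200 (under), so the earliest
        # message is dropped and the rest are kept.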
        for i_start in range(0, len(source['conversation']) - 1):
            totaltokens = tokens_header + sum(tokens_messages[i_start:])
            if totaltokens < limit_data_length:
                break
        if totaltokens > limit_data_length:
            # Drop the conversation entirely if it still exceeds the threshold
            print(f'[{num+1}]:DROP', end=' ')
            continue
    else:
        # To be used as a starting index for the next big loop
        i_start = 0
    sharegpt_text = {
        "roles": [label_user, label_bot],
        "conversations": []
    }
    current_user = None
    previous_user = None
    if not limit_data:
        assert i_start == 0
    # Add the messages (prompt and chat history) to the conversation list.
    # Unless limiting the data length, i_start *must* be zero.
    for message in source['conversation'][i_start:]:
        message_count += 1
        current_user = message['name']
        message_text = ""
        if previous_user == current_user:
            # Message usernames must alternate. Error out if they don't, as this indicates a problem
            raise ValueError(f'{file}: Consecutive messages from the same person or incorrect indentation.')
        if len(message['text']) < 25:
            # An arbitrarily low threshold that should include most copy/paste errors.
            raise ValueError(f'{file}: Unusually short message in the conversation.')
        message_text = message["text"].strip()
        message_text = substitute_participants(message_text)
        message_text = fix_punctuation(message_text)
        sharegpt_text["conversations"].append({
            "from": "gpt" if message["name"] == "<FIRST>" else "human",
            "value": re.sub(r'[\n]+', '', message_text)
        })
        previous_user = current_user
    average_message_length = statistics.mean(message_lengths)
    header = []
    header.append(f'{seq_system}')
    header.append(f"{placeholder_bot}'s Persona: {source['persona']['<FIRST>']}\n")
    header.append(f"{placeholder_user}'s Persona: {source['persona']['<SECOND>']}\n")
    header.append(f"Scenario: {source['scenario']}\n")
    selected_prompt_array = two_char_system_prompts if use_two_char_sys else single_char_system_prompts
    prompt_string = random.choice(selected_prompt_array) if system_prompt_index == -1 else selected_prompt_array[system_prompt_index]
    prompt_string = substitute_participants(prompt_string)
    prompt_string = fix_punctuation(prompt_string)
    sharegpt_text['conversations'].insert(0, {
        "from": "system",
        "value": get_prompt(prompt_string)
    })
    # XXX: this must be taken elsewhere as it repeats the tokenization performed
    # initially to limit message length.
    # len_total = len(tokenizer(header + conversation_text)['input_ids'])
    # Since the limiting process has some wiggle room, make sure that the
    # final data doesn't exceed the length limit
    # if len_total > limit_data_length:
    #     raise OverflowError(f"{file} exceeds the context length limit")
    # token_lengths.append(len_total)
    entry = sharegpt_text
    entries.append(entry)
    # break
    # print(f'[{num+1}]:{text_average_message_length}:{len_total}', end=' ')
print(f"\n\nTotal conversations: {len(entries)}\nTotal messages: {message_count}\n")
# f"Longest sequence length: {max(token_lengths)} tokens\n"
# f"Mean sequence length: {statistics.mean(token_lengths):.1f} tokens\n"
# f"Total training tokens: {sum(token_lengths):,} tokens")
# Create a dataframe and shuffle it, resetting the index.
df = pandas.DataFrame(entries)
df = df.sample(frac=1).reset_index(drop=True)
df.to_json(outfile, orient='records', lines=True)
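# Each line of the output JSONL holds one ShareGPT-style record shaped like
# the following (names and values illustrative):
# {"roles": ["Bob", "Alice"],
#  "conversations": [{"from": "system", "value": "Enter roleplay mode. ..."},
#                    {"from": "human",  "value": "..."},
#                    {"from": "gpt",    "value": "..."}]}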