# LimaRP-ShareGPT (@bdashore3)
# Converts LimaRP YAML conversations into ShareGPT-format JSONL training data.
import glob
import re
import os.path
import statistics
import random
import pandas
import yaml
from transformers import LlamaTokenizer, AutoTokenizer
# Training examples designed for 4k context size, used for the initial ~1000
# samples release of LimaRP.
files = glob.glob('./data/**/*.yaml', recursive=True)
outfile = 'out/train4k.jsonl'
# Training examples designed for 8k context size. Avoid using with a 4k context
# size (in other words, limiting data length to 4096 tokens using the option below),
# as character personas and scenario may end up not being accurate to the context.
# files = glob.glob('./data-long/**/*.yaml', recursive=True)
# outfile = 'out/train8k.jsonl'
# Old evals. These are generally lower-quality training examples removed from the
# initially-made 4k dataset, or training examples with unwanted issues.
# files = glob.glob('./data-evals/**/*.yaml', recursive=True)
# outfile = '/home/anon/bin/qlora/piper/eval.jsonl'
# The pretrained model path is needed for using its tokenizer
pretrained_model_path = r'F:\AI\models\meta-llama_Llama-2-13b-hf'
# Try to limit training example length by removing early messages, without clipping
# them. This works better if the RP messages in the conversations aren't too long,
# otherwise training examples can end up being significantly shorter than the limit.
limit_data = False
limit_data_length = 4096
# This changes the way the data is arranged in the output json files, affecting
# model prediction during training in subtle ways. Supported formats:
#
# 'output_only'
# everything on the output like Guanaco (similar to unsupervised tuning).
# 'bot_output'
# system+conversation on the input, last bot response on the output (same as Pygmalion).
# 'system_input'
# system on the input, entire conversation on the output.
train_format = 'output_only'
# Use original character names instead of replacing them with alternative labels.
use_original_names = True
# Alternative labels to prepend at the start of the utterance, e.g. `USER:`
label_user = 'USER'
label_bot = 'CHAR'
# Alternative labels to use inside the utterance
placeholder_user = 'USER'
placeholder_bot = 'CHAR'
# Various instruct and model sequences. Original LimaRP format.
seq_system = '<<SYSTEM>>'
seq_human = '<<HUMAN>>'
seq_aibot = '<<AIBOT>>'
# Same, but Alpaca format. Doesn't appear to work as well as the LimaRP format.
# seq_system = '### Instruction:'
# seq_human = '### Input:'
# seq_aibot = '### Response:'
# Character placeholders baked in the files; **DO NOT CHANGE**
placeholder_user_old = '<SECOND>'
placeholder_bot_old = '<FIRST>'
# System prompt options
two_char_system_prompts = [
"""Enter roleplay mode. You are currently %{having a conversation|in conversation|in a roleplay chat} with <SECOND>, whose %{traits are|persona is|characteristics are}:
<SECOND PERSONA>
%{You are|Play the role of|Take the role of} <FIRST> with the following %{persona|definitions|character sheet|traits}:
<FIRST PERSONA>
%{In addition|Additionally|Also}, %{keep the following scenario in mind|remember this scenario|pay attention to this scenario}:
<SCENARIO>""",
"""<SECOND>'s Persona: <SECOND PERSONA>
<FIRST>'s Persona: <FIRST PERSONA>
You are <FIRST>. Using the above %{persona|traits|character sheet|character definitions} for <FIRST>, you must engage in %{a roleplay conversation|an RP chat} with <SECOND>. %{Keep the following scenario in mind|Remember this scenario|Pay attention to this scenario}:
<SCENARIO>""",
]
single_char_system_prompts = [
"""Enter roleplay mode. %{You are|Play the role of|Take the role of|Become the character} <FIRST> with the following %{persona|definitions|character sheet|traits}:
<FIRST PERSONA>
%{In addition|Additionally|Also}, %{keep the following scenario in mind|remember this scenario|pay attention to this scenario}:
<SCENARIO>""",
"""<FIRST>'s Persona: <FIRST PERSONA>
You are <FIRST>. Using the above %{persona|traits|character sheet|character definitions}, you must engage in %{a roleplay conversation|an RP chat}. %{Keep the following scenario in mind|Remember this scenario|Pay attention to this scenario}:
<SCENARIO>"""
]
# Use two-character or one-character system prompts
use_two_char_sys = True
# -1 = both prompts, 0 = system prompt A, 1 = system prompt B
system_prompt_index = 1
tokenizer = LlamaTokenizer.from_pretrained(pretrained_model_path)
def substitute_participants(input_string):
    '''
    Replace placeholder usernames with different names.
    TODO: the implementation could be improved by *not* relying on external variables.
    '''
    input_string = input_string.replace("<FIRST PERSONA>", source['persona']['<FIRST>'])
    input_string = input_string.replace("<SECOND PERSONA>", source['persona']['<SECOND>'])
    input_string = input_string.replace("<SCENARIO>", source['scenario'])
    input_string = input_string.replace("<SECOND>", placeholder_user)
    input_string = input_string.replace("<FIRST>", placeholder_bot)
    return input_string
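# Illustrative example (names assumed): with placeholder_bot = 'Alice',
# placeholder_user = 'Bob', and personas/scenario taken from the currently
# loaded `source`:
#   substitute_participants('<FIRST> waves at <SECOND>.')
#   -> 'Alice waves at Bob.'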
def fix_punctuation(input_string):
    '''
    Replace fancy/incorrect punctuation with simpler/correct punctuation.
    TODO: more effective regexes, options for controlling what should be changed.
    '''
    # Fix excessive horizontal whitespace. This should go before everything else.
    input_string = re.sub(r' {2,}', ' ', input_string)
    # General punctuation fixes
    input_string = input_string.replace(' !', '!')
    input_string = input_string.replace(' ?', '?')
    input_string = input_string.replace('’', "'")
    input_string = input_string.replace('‘', "'")
    input_string = input_string.replace('“', '"')
    input_string = input_string.replace('”', '"')
    input_string = input_string.replace('…', '...')
    # Replace the em-dash surrogate `---` used in the source files with an
    # actual em-dash, since some people apparently dislike typing them directly.
    input_string = input_string.replace('---', '—')
    # Fix incorrect ellipses. This should preferably be fixed in the
    # source files themselves.
    input_string = re.sub(r'(\w)\.{2,8}(\w)', r'\1... \2', input_string)
    input_string = re.sub(r'(\w)\.{3,8}', r'\1...', input_string)
    return input_string
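# Illustrative example:
#   fix_punctuation('“Wait…  really ?”')
#   -> '"Wait... really?"'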
def detect_single_newlines(input_string):
    '''
    Single newlines are most of the time unwanted.
    '''
    # Remove double newlines first
    input_string = input_string.replace('\n\n', '||')
    # Now detect single newline presence
    return input_string.find('\n') >= 0
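# Illustrative example:
#   detect_single_newlines('First paragraph.\n\nSecond paragraph.')  -> False
#   detect_single_newlines('First line.\nSecond line.')              -> True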
def validate_placeholders(input_string, placeholder_list):
    '''
    Verify that the placeholders (baked into the files) have been used
    correctly: they must not be left open and must not contain typos.
    '''
    search_placeholders = re.findall('<.*?>', input_string)
    if search_placeholders:
        # If the list is non-empty, some strings were detected as placeholders.
        for item in search_placeholders:
            # Compare the strings to the known placeholders. They must match exactly.
            if item not in placeholder_list:
                return False
    # A naive check is counting the number of '<' and '>'. They must match, although
    # this means that these symbols cannot be used for other things, which was already
    # the assumption anyway.
    str_lt = input_string.count('<')
    str_gt = input_string.count('>')
    if str_lt != str_gt:
        return False
    # All checks passed
    return True
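# Illustrative example:
#   validate_placeholders('<FIRST> nods.', ['<SECOND>', '<FIRST>'])  -> True
#   validate_placeholders('<FIRTS> nods.', ['<SECOND>', '<FIRST>'])  -> False (typo)
#   validate_placeholders('<FIRST nods.',  ['<SECOND>', '<FIRST>'])  -> False (unbalanced)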
def get_prompt(prompt_string):
    pattern = re.compile(r'%{(.+?)}')
    for m in re.finditer(pattern, prompt_string):
        match = m.group(0)
        replace = random.choice(m.group(1).split("|"))
        prompt_string = prompt_string.replace(match, replace)
    return prompt_string
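# Illustrative example of the %{a|b|c} alternation syntax used in the system
# prompts above:
#   get_prompt('%{You are|Play the role of} <FIRST>.')
#   -> 'You are <FIRST>.' or 'Play the role of <FIRST>.' (picked at random)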
message_count = 0
entries = []
token_lengths = []
count = 0
for num, file in enumerate(files):
    with open(file, 'r', encoding='utf-8') as f:
        source = yaml.safe_load(f)
    if use_original_names:
        label_user = source['names']['<SECOND>']
        label_bot = source['names']['<FIRST>']
        placeholder_user = source['names']['<SECOND>']
        placeholder_bot = source['names']['<FIRST>']
    if label_user is None:
        raise NameError(f"{file}: USER must have a name.")
    elif label_bot is None:
        raise NameError(f"{file}: CHAR must have a name.")
    elif not (3 <= len(label_user) <= 20) or not (3 <= len(label_bot) <= 20):
        raise NameError(f"{file}: unusual character name length: possible issue.")
    elif ('SECOND' in source['names']['<SECOND>']) or ('FIRST' in source['names']['<FIRST>']):
        # No need to check for the exact string here, just if there's a clearly
        # defective name that likely resulted from user error while manually
        # processing the files.
        raise NameError(f"{file}: Incorrect character names.")
    elif (source['persona']['<FIRST>'] is None) or (len(source['persona']['<FIRST>']) < 20):
        raise SyntaxError(f'{file}: no persona defined for <FIRST>.')
    elif (source['persona']['<SECOND>'] is None) or (len(source['persona']['<SECOND>']) < 20):
        raise SyntaxError(f'{file}: no persona defined for <SECOND>.')
    elif source['scenario'] is None:
        raise SyntaxError(f'{file}: Scenario missing.')
    elif len(source['scenario']) < 100:
        raise SyntaxError(f'{file}: Probable error in scenario (too short).')
    # Perform various per-message checks, and store the length of the bot's
    # messages in words.
    message_lengths = []
    for message in source['conversation']:
        # Check if there are open (unpaired) quotation marks
        quotation_marks = message['text'].count('"')
        if (quotation_marks % 2) != 0:
            print(f'\n{file}: Open quotation marks\n{message["text"]}\n\n')
        # Check if there are unpaired asterisks (for inner thoughts, etc.)
        paired_asterisks = message['text'].count('*')
        if (paired_asterisks % 2) != 0:
            print(f'\n{file}: Unpaired asterisks\n{message["text"]}\n\n')
        # Check for single newlines
        if detect_single_newlines(message['text']):
            print(f'\n{file}: Single newline\n{message["text"]}\n\n')
        # Check if placeholders have been used correctly in the messages
        if not validate_placeholders(message['text'], [placeholder_user_old, placeholder_bot_old]):
            print(f'\n{file}: Incorrect placeholder\n{message["text"]}\n\n')
        if message['name'] == '<FIRST>':
            message_lengths.append(len(message['text'].split()))
    # Check if placeholders have been used correctly in personas and scenarios
    if not validate_placeholders(source['scenario'], [placeholder_user_old, placeholder_bot_old]):
        print(f'\n{file}: Incorrect placeholder in scenario (or: < > not allowed)\n{source["scenario"]}\n\n')
    if not validate_placeholders(source['persona']['<FIRST>'], [placeholder_user_old, placeholder_bot_old]):
        print(f'\n{file}: Incorrect placeholder in <FIRST> Persona (or: < > not allowed)\n{source["persona"]["<FIRST>"]}\n\n')
    if not validate_placeholders(source['persona']['<SECOND>'], [placeholder_user_old, placeholder_bot_old]):
        print(f'\n{file}: Incorrect placeholder in <SECOND> Persona (or: < > not allowed)\n{source["persona"]["<SECOND>"]}\n\n')
    # This section tries to limit the total number of tokens below a predefined limit.
    # It operates on a message basis. The current algorithm slows down parsing as
    # it repeats tokenization - it was made quickly to test the idea.
    if limit_data:
        # First = GPT, Second = Human
        tokens_persona_1 = len(tokenizer(source['persona']['<FIRST>'])['input_ids'])
        tokens_persona_2 = len(tokenizer(source['persona']['<SECOND>'])['input_ids'])
        tokens_scenario = len(tokenizer(source['scenario'])['input_ids'])
        tokens_extra = 20 + 45  # Extra tokens to take into account newlines and preamble
        tokens_header = tokens_persona_1 + tokens_persona_2 + tokens_scenario + tokens_extra
        tokens_messages = []
        total_tokens = tokens_header
        for message in source['conversation']:
            message_tokens = 16  # Worst-case allowance for newlines, system sequences and character names
            message_tokens += len(tokenizer(message['text'])['input_ids'])
            tokens_messages.append(message_tokens)
        # Naive search for the smallest subset of messages whose token length,
        # summed with that of the header, stays below the limit.
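        # Worked example (assumed numbers): with limit_data_length = 4096,
        # tokens_header = 600 and tokens_messages = [900, 800, 700, 600, 500],
        # i_start = 0 gives 600 + 3500 = 4100 (over the limit) while
        # i_start = 1 gives 600 + 2600 = 3200 (under), so the earliest
        # message is dropped and the rest are kept.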
        for i_start in range(0, len(source['conversation']) - 1):
            totaltokens = tokens_header + sum(tokens_messages[i_start:])
            if totaltokens < limit_data_length:
                break
        if totaltokens > limit_data_length:
            # Drop the conversation entirely if it still exceeds the threshold
            print(f'[{num+1}]:DROP', end=' ')
            continue
    else:
        # To be used as a starting index for the next big loop
        i_start = 0
    sharegpt_text = {
        "roles": [label_user, label_bot],
        "conversations": []
    }
    current_user = None
    previous_user = None
    if not limit_data:
        assert i_start == 0
    # Add the messages (prompt and chat history) to the conversation list.
    # Unless limiting the data length, i_start *must* be zero.
    for message in source['conversation'][i_start:]:
        message_count += 1
        current_user = message['name']
        message_text = ""
        if previous_user == current_user:
            # Message usernames must alternate. Error out if they don't, as this indicates a problem
            raise ValueError(f'{file}: Consecutive messages from the same person or incorrect indentation.')
        if len(message['text']) < 25:
            # An arbitrarily low threshold that should include most copy/paste errors.
            raise ValueError(f'{file}: Unusually short message in the conversation.')
        message_text = message["text"].strip()
        message_text = substitute_participants(message_text)
        message_text = fix_punctuation(message_text)
        sharegpt_text["conversations"].append({
            "from": "gpt" if message["name"] == "<FIRST>" else "human",
            "value": re.sub(r'[\n]+', '', message_text)
        })
        previous_user = current_user
    average_message_length = statistics.mean(message_lengths)
    header = []
    header.append(f'{seq_system}')
    header.append(f"{placeholder_bot}'s Persona: {source['persona']['<FIRST>']}\n")
    header.append(f"{placeholder_user}'s Persona: {source['persona']['<SECOND>']}\n")
    header.append(f"Scenario: {source['scenario']}\n")
    selected_prompt_array = two_char_system_prompts if use_two_char_sys else single_char_system_prompts
    prompt_string = random.choice(selected_prompt_array) if system_prompt_index == -1 else selected_prompt_array[system_prompt_index]
    prompt_string = substitute_participants(prompt_string)
    prompt_string = fix_punctuation(prompt_string)
    sharegpt_text['conversations'].insert(0, {
        "from": "system",
        "value": get_prompt(prompt_string)
    })
    # XXX: this must be taken elsewhere as it repeats the tokenization performed
    # initially to limit message length.
    # len_total = len(tokenizer(header + conversation_text)['input_ids'])
    # Since the limiting process has some wiggle room, make sure that the
    # final data doesn't exceed the length limit
    # if len_total > limit_data_length:
    #     raise OverflowError(f"{file} exceeds the context length limit")
    # token_lengths.append(len_total)
    entry = sharegpt_text
    entries.append(entry)
    # break
    # print(f'[{num+1}]:{text_average_message_length}:{len_total}', end=' ')
print(f"\n\nTotal conversations: {len(entries)}\nTotal messages: {message_count}\n")
# f"Longest sequence length: {max(token_lengths)} tokens\n"
# f"Mean sequence length: {statistics.mean(token_lengths):.1f} tokens\n"
# f"Total training tokens: {sum(token_lengths):,} tokens")
# Create a dataframe and shuffle it, resetting the index.
df = pandas.DataFrame(entries)
df = df.sample(frac=1).reset_index(drop=True)
df.to_json(outfile, orient='records', lines=True)
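# Each line of the output JSONL holds one ShareGPT-style record shaped like
# the following (names and values illustrative):
# {"roles": ["Bob", "Alice"],
#  "conversations": [{"from": "system", "value": "Enter roleplay mode. ..."},
#                    {"from": "human",  "value": "..."},
#                    {"from": "gpt",    "value": "..."}]}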