@iwalton3
Created May 12, 2023 03:52
Message Splitter - Split chat messages into chunks for training GPTs
data-format.json (format file for text-generation-webui; %data% is substituted with each training entry's "data" field):

{
    "data": "%data%"
}
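
For reference, each record the splitter writes to the output file fills this template. A hypothetical (shortened) entry in dataset-chat-split.json would look like:

{
    "data": "Alice: Hey, how's it going?<!end!>\nBob: Pretty good!<!end!>"
}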
#!/usr/bin/env python3
import json
import datetime
import re

# Licensed under the MIT License.
# Usage is unlimited, but please be wary of the consequences of training GPTs on human messages.
# Attempting to mimic a human personality via GPT could be very harmful depending on the context.

# To use, set the values below and run. You can then train the resulting file
# in text-generation-webui as a LoRA using the data-format.json format file.

# messages must be within 5 minutes of each other to stay in the same chunk
chunk_time = 60 * 5

# minimum number of messages in a chunk, or the block is thrown out
# I would suggest changing this to 5 for real data
min_messages = 3

# if a thread is longer than this, split it into overlapping blocks
block_size = 10

input_file = 'example-chat-fmt.json'
output_file = 'dataset-chat-split.json'

# phrases or words in these lists throw out the respective block
# AND cause a message thread split
disallowed_phrases = []
disallowed_words = []

word_regex = re.compile(r'([a-zA-Z_]+)')
half_block_size = block_size // 2

with open(output_file, 'w') as out:
    message_chunks = []
    inp = json.load(open(input_file))

    for thread in inp:
        messages = []
        last_ts = None
        flag_split = False
        for msg in thread['data']:
            lowercase_content = msg['message'].lower()

            # drop messages containing disallowed phrases or words and force a thread split
            should_skip = False
            for phrase in disallowed_phrases:
                if phrase in lowercase_content:
                    should_skip = True
                    flag_split = True
                    break
            for word in word_regex.findall(lowercase_content):
                if word in disallowed_words:
                    should_skip = True
                    flag_split = True
                    break
            if should_skip:
                continue

            # strip a trailing 'Z' so fromisoformat also works on Python < 3.11
            message_ts = datetime.datetime.fromisoformat(msg['timestamp'].replace('Z', '+00:00')).timestamp()
            if (last_ts is not None and message_ts - last_ts > chunk_time) or flag_split:
                # too much time has passed (or a split was forced): start a new chunk
                flag_split = False
                if len(messages) > min_messages:
                    message_chunks.append(messages)
                messages = []
            elif len(messages) >= block_size:
                # long thread: emit the block and keep the second half as overlap
                message_chunks.append(messages)
                messages = messages[half_block_size:]
            last_ts = message_ts
            messages.append(msg)
        if len(messages) >= min_messages:
            message_chunks.append(messages)

    training_data = []
    for chunk in message_chunks:
        # mark the end of each speaker's turn with an <!end!> token
        last_message = None
        for message in chunk:
            if last_message is not None and message['user'] != last_message['user']:
                last_message['message'] += '<!end!>'
            last_message = message
        last_message['message'] += '<!end!>'

        # only prefix the author name when the speaker changes
        current_author = None
        for message in chunk:
            if current_author is None or message['user'] != current_author:
                message['include_author'] = True
            else:
                message['include_author'] = False
            current_author = message['user']

        text = '\n'.join(f"{msg['user']}: {msg['message']}" if msg['include_author'] else msg['message'] for msg in chunk)
        # collapse repeated <!end!> tokens (messages shared between overlapping blocks get tagged more than once)
        text = text.replace('<!end!><!end!>', '<!end!>').replace('<!end!><!end!>', '<!end!>').replace('<!end!><!end!>', '<!end!>')
        training_data.append({"data": text})

    print("Created", len(training_data), "training data chunks")
    json.dump(training_data, out)
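
To sanity-check the output before training, here is a minimal sketch, assuming the script above has already been run and left dataset-chat-split.json in the working directory:

#!/usr/bin/env python3
import json

# load the dataset the splitter produced and inspect the first chunk
with open('dataset-chat-split.json') as f:
    chunks = json.load(f)

print(len(chunks), "chunks")
print(chunks[0]['data'])  # one "User: message<!end!>" line per speaker turn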
example-chat-fmt.json (example input file; each top-level entry is a message thread whose "data" list holds messages with a timestamp, user, and message):

[
    {
        "data": [
            {
                "timestamp": "2023-05-11T12:00:00Z",
                "user": "Alice",
                "message": "Hey, how's it going?"
            },
            {
                "timestamp": "2023-05-11T12:01:00Z",
                "user": "Bob",
                "message": "Pretty good! Just working on that project due next week."
            },
            {
                "timestamp": "2023-05-11T12:04:00Z",
                "user": "Alice",
                "message": "Oh, right. How's that coming along?"
            }
        ]
    },
    {
        "data": [
            {
                "timestamp": "2023-05-11T12:10:00Z",
                "user": "Bob",
                "message": "I'm having some issues with the calculations."
            },
            {
                "timestamp": "2023-05-11T12:11:00Z",
                "user": "Alice",
                "message": "I can help with that, if you want."
            },
            {
                "timestamp": "2023-05-11T12:13:00Z",
                "user": "Bob",
                "message": "That would be great, thanks!"
            }
        ]
    }
]
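
With the default settings, the first example thread above survives filtering (three messages, all within five minutes) and becomes a single training chunk whose "data" text looks roughly like this:

Alice: Hey, how's it going?<!end!>
Bob: Pretty good! Just working on that project due next week.<!end!>
Alice: Oh, right. How's that coming along?<!end!>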