@iwalton3
Created May 12, 2023 03:52
Message Splitter - Split chat messages into chunks for training GPTs
data-format.json (format file for text-generation-webui; %data% is substituted with each training entry's "data" field):

{
    "data": "%data%"
}
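
For reference, each record the splitter writes to the output file fills this template. A hypothetical (shortened) entry in dataset-chat-split.json would look like:

{
    "data": "Alice: Hey, how's it going?<!end!>\nBob: Pretty good!<!end!>"
}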
#!/usr/bin/env python3
import json
import datetime
import re

# Licensed under the MIT License.
# Usage is unlimited, but please be wary of the consequences of training GPTs on human messages.
# Attempting to mimic a human personality via GPT could be very harmful depending on the context.

# To use, set the values below and run. You can then train the resulting file
# in text-generation-webui as a LoRA using the data-format.json format file.

# messages must be within 5 minutes of each other to stay in the same chunk
chunk_time = 60 * 5

# minimum number of messages in a chunk, or the block is thrown out
# I would suggest changing this to 5 for real data
min_messages = 3

# if a thread is longer than this, split it into overlapping blocks
block_size = 10

input_file = 'example-chat-fmt.json'
output_file = 'dataset-chat-split.json'

# phrases or words in these lists throw out the respective block
# AND cause a message thread split
disallowed_phrases = []
disallowed_words = []

word_regex = re.compile(r'([a-zA-Z_]+)')
half_block_size = block_size // 2

with open(output_file, 'w') as out:
    message_chunks = []
    inp = json.load(open(input_file))

    for thread in inp:
        messages = []
        last_ts = None
        flag_split = False
        for msg in thread['data']:
            lowercase_content = msg['message'].lower()

            # drop messages containing disallowed phrases or words and force a thread split
            should_skip = False
            for phrase in disallowed_phrases:
                if phrase in lowercase_content:
                    should_skip = True
                    flag_split = True
                    break
            for word in word_regex.findall(lowercase_content):
                if word in disallowed_words:
                    should_skip = True
                    flag_split = True
                    break
            if should_skip:
                continue

            # strip a trailing 'Z' so fromisoformat also works on Python < 3.11
            message_ts = datetime.datetime.fromisoformat(msg['timestamp'].replace('Z', '+00:00')).timestamp()
            if (last_ts is not None and message_ts - last_ts > chunk_time) or flag_split:
                # too much time has passed (or a split was forced): start a new chunk
                flag_split = False
                if len(messages) > min_messages:
                    message_chunks.append(messages)
                messages = []
            elif len(messages) >= block_size:
                # long thread: emit the block and keep the second half as overlap
                message_chunks.append(messages)
                messages = messages[half_block_size:]
            last_ts = message_ts
            messages.append(msg)
        if len(messages) >= min_messages:
            message_chunks.append(messages)

    training_data = []
    for chunk in message_chunks:
        # mark the end of each speaker's turn with an <!end!> token
        last_message = None
        for message in chunk:
            if last_message is not None and message['user'] != last_message['user']:
                last_message['message'] += '<!end!>'
            last_message = message
        last_message['message'] += '<!end!>'

        # only prefix the author name when the speaker changes
        current_author = None
        for message in chunk:
            if current_author is None or message['user'] != current_author:
                message['include_author'] = True
            else:
                message['include_author'] = False
            current_author = message['user']

        text = '\n'.join(f"{msg['user']}: {msg['message']}" if msg['include_author'] else msg['message'] for msg in chunk)
        # collapse repeated <!end!> tokens (messages shared between overlapping blocks get tagged more than once)
        text = text.replace('<!end!><!end!>', '<!end!>').replace('<!end!><!end!>', '<!end!>').replace('<!end!><!end!>', '<!end!>')
        training_data.append({"data": text})

    print("Created", len(training_data), "training data chunks")
    json.dump(training_data, out)
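
To sanity-check the output before training, here is a minimal sketch, assuming the script above has already been run and left dataset-chat-split.json in the working directory:

#!/usr/bin/env python3
import json

# load the dataset the splitter produced and inspect the first chunk
with open('dataset-chat-split.json') as f:
    chunks = json.load(f)

print(len(chunks), "chunks")
print(chunks[0]['data'])  # one "User: message<!end!>" line per speaker turn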
example-chat-fmt.json (example input file; each top-level entry is a message thread whose "data" list holds messages with a timestamp, user, and message):

[
    {
        "data": [
            {
                "timestamp": "2023-05-11T12:00:00Z",
                "user": "Alice",
                "message": "Hey, how's it going?"
            },
            {
                "timestamp": "2023-05-11T12:01:00Z",
                "user": "Bob",
                "message": "Pretty good! Just working on that project due next week."
            },
            {
                "timestamp": "2023-05-11T12:04:00Z",
                "user": "Alice",
                "message": "Oh, right. How's that coming along?"
            }
        ]
    },
    {
        "data": [
            {
                "timestamp": "2023-05-11T12:10:00Z",
                "user": "Bob",
                "message": "I'm having some issues with the calculations."
            },
            {
                "timestamp": "2023-05-11T12:11:00Z",
                "user": "Alice",
                "message": "I can help with that, if you want."
            },
            {
                "timestamp": "2023-05-11T12:13:00Z",
                "user": "Bob",
                "message": "That would be great, thanks!"
            }
        ]
    }
]
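
With the default settings, the first example thread above survives filtering (three messages, all within five minutes) and becomes a single training chunk whose "data" text looks roughly like this:

Alice: Hey, how's it going?<!end!>
Bob: Pretty good! Just working on that project due next week.<!end!>
Alice: Oh, right. How's that coming along?<!end!>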