Skip to content

Instantly share code, notes, and snippets.

@standaloneSA
Created December 24, 2023 04:54
Show Gist options
  • Save standaloneSA/0ffdef6bd77661979eb791ba5c885944 to your computer and use it in GitHub Desktop.
Save standaloneSA/0ffdef6bd77661979eb791ba5c885944 to your computer and use it in GitHub Desktop.
Quick Python script that breaks a text file into overlapping segments and prints them as JSON training records
#!/usr/bin/env python3
import sys
import os
import json
import re
def _segment_text(text, max_segment_length, overlap_length):
    """Cut *text* into overlapping segments, preferring whitespace boundaries."""
    segments = []
    text_length = len(text)
    segment_start = 0
    while segment_start < text_length:
        cut = segment_start + max_segment_length
        if cut < text_length:
            # Walk back to the nearest whitespace so words are not split.
            probe = cut
            while probe > segment_start and not text[probe].isspace():
                probe -= 1
            # Only accept a break point that moves us forward. If the window
            # holds no whitespace past its first character, keep the hard cut
            # at max_segment_length; otherwise the next segment would start
            # at the same position and the loop would never terminate.
            if probe > segment_start:
                cut = probe
        # Extend past the cut so consecutive segments share overlap_length
        # characters, without running off the end of the text.
        segment_end = min(cut + overlap_length, text_length)
        segments.append(text[segment_start:segment_end])
        if segment_end >= text_length:
            # This segment already reaches the end of the text.
            break
        segment_start = cut
    return segments


def break_text_into_segments(input_file, max_segment_length, overlap_length):
    """Split the contents of a text file into overlapping segments.

    Each segment begins at the previous cut point and runs to the next cut
    point plus ``overlap_length`` extra characters, so the tail of one
    segment is repeated at the head of the next. A cut point lies at most
    ``max_segment_length`` characters after the segment start and is moved
    back to the nearest whitespace character when one exists in that window;
    when none exists the text is cut at exactly ``max_segment_length``
    characters. A non-empty text always yields at least one segment.

    Args:
        input_file: Path of the text file to read (decoded as UTF-8).
        max_segment_length: Maximum number of characters between a segment's
            start and its cut point. Must be greater than zero.
        overlap_length: Number of characters appended after the cut point
            and shared with the following segment. Must not be negative.

    Returns:
        A list of segment strings in file order; empty for an empty file.

    Raises:
        ValueError: If ``max_segment_length`` is not positive or
            ``overlap_length`` is negative.
        OSError: If the file cannot be opened or read.
    """
    if max_segment_length <= 0:
        raise ValueError("max_segment_length must be greater than zero")
    if overlap_length < 0:
        raise ValueError("overlap_length must not be negative")

    # Explicit encoding keeps the result independent of the platform locale.
    with open(input_file, 'r', encoding='utf-8') as file:
        text = file.read()
    return _segment_text(text, max_segment_length, overlap_length)
if __name__ == '__main__':
    # Segment sizing, in characters, used for the generated records.
    max_length = 512
    overlap_length = 50

    if len(sys.argv) < 2:
        # Usage errors go to stderr so they never end up in redirected output.
        print(f"Usage: {sys.argv[0]} <text-file>", file=sys.stderr)
        sys.exit(1)

    segments = break_text_into_segments(sys.argv[1], max_length, overlap_length)
    for s in segments:
        # One JSON object per segment; leading whitespace left at the cut
        # point is stripped from the emitted text.
        print(json.dumps({"text": s.lstrip()}))
        # NOTE(review): the source lost its indentation, so it is assumed
        # this blank separator line is printed after every record rather
        # than once after the loop -- confirm against the intended output.
        print("")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment