Skip to content

Instantly share code, notes, and snippets.

@FlyingFathead
Last active December 24, 2023 14:21
Show Gist options
  • Save FlyingFathead/e85c8aec2139a7aec49e0f4c4c95af0e to your computer and use it in GitHub Desktop.
Save FlyingFathead/e85c8aec2139a7aec49e0f4c4c95af0e to your computer and use it in GitHub Desktop.
Split an input text into segments of at most x characters, breaking at empty lines; for ML/LLM purposes
# v0.03 // added preformat sanitizer for hyphenation and other text formatting
import shutil
import sys
import os
import re
# Print a horizontal line spanning the full terminal width
def hz_line(character='-'):
    """Print a horizontal rule exactly as wide as the terminal.

    Args:
        character: The string repeated to build the rule. It may be longer
            than one character; the repeated pattern is cut off at the
            terminal width.
    """
    terminal_width = shutil.get_terminal_size().columns
    # Repeat, then truncate: a multi-character pattern would otherwise be
    # several times wider than the terminal and wrap onto extra lines.
    line = (character * terminal_width)[:terminal_width]
    print(line)
# text preprocessing & sanitization
def sanitize_content(text):
    """Clean up hard-wrapped text so each paragraph sits on a single line.

    Three passes are applied in order: words hyphenated across a line break
    are re-joined, single newlines are replaced by spaces (merging wrapped
    lines into paragraphs), and runs of spaces/tabs are collapsed to one
    space. Sequences of two or more newlines are left untouched so they keep
    acting as paragraph separators.

    Args:
        text: The raw text to sanitize.

    Returns:
        The sanitized text.
    """
    # Remove hyphenation at the end of lines and join words. Only horizontal
    # whitespace may surround the line break, and the next line must contain
    # text; otherwise a hyphen at the end of a paragraph would swallow the
    # blank line after it and fuse two paragraphs together.
    text = re.sub(r'-[^\S\n]*\n[^\S\n]*(?=\S)', '', text)
    # Merge lines into paragraphs: a lone newline becomes a space.
    text = re.sub(r'(?<!\n)\n(?!\n)', ' ', text)
    # Replace multiple spaces (but not newlines) with a single space. This
    # runs after merging so that trailing or leading whitespace around a
    # former line break does not leave a double space behind.
    text = re.sub(r'[^\S\n]+', ' ', text)
    return text
def split_text_for_translation(file_path, char_limit):
    """Read a UTF-8 text file, sanitize it, and split it into sections.

    The content is passed through ``sanitize_content`` and then walked line
    by line. Lines are accumulated into the current section until adding the
    next non-blank line would reach ``char_limit``; that line then starts a
    new section. Blank lines are always kept with the section they follow,
    and a line is never cut in the middle, so a section can exceed
    ``char_limit`` when a single line is longer than the limit.

    Args:
        file_path: Path of the text file to read.
        char_limit: Target maximum number of characters per section.

    Returns:
        A list of section strings in document order. The list is empty when
        the file contains no non-whitespace text.
    """
    hz_line()
    print(f"Current character split limit (from next empty line): {char_limit} characters.")
    hz_line()

    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # Sanitize the text for hyphenation
    content = sanitize_content(content)

    sections = []
    current_parts = []      # pieces of the section being built
    current_length = 0      # total characters held in current_parts
    has_text = False        # True once the current section holds a non-blank line

    for line in content.split('\n'):
        is_blank = not line.strip()
        fits = current_length + len(line) < char_limit
        # Close the current section only if it already carries real text.
        # Without the has_text guard, an over-long first paragraph (or one
        # preceded only by blank lines) would emit an empty or
        # whitespace-only section.
        if has_text and not is_blank and not fits:
            sections.append(''.join(current_parts))
            current_parts = []
            current_length = 0
            has_text = False
        current_parts.append(line)
        current_parts.append('\n')
        current_length += len(line) + 1
        has_text = has_text or not is_blank

    # Skip the final flush when nothing but whitespace was collected (empty
    # or blank input), so callers do not receive a bogus "\n" section.
    if has_text:
        sections.append(''.join(current_parts))
    return sections
def main(file_path, char_limit):
    """Split a text file into sections and write each one to its own file.

    The sections are written to the directory ``<base_name>-splits`` (relative
    to the current working directory), where ``<base_name>`` is the input file
    name without directory or extension. Files are named
    ``<base_name>_split_<index>.txt`` with a zero-padded, 1-based index.

    Args:
        file_path: Path of the text file to split.
        char_limit: Character limit passed on to ``split_text_for_translation``.
    """
    base_name = os.path.splitext(os.path.basename(file_path))[0]
    output_dir = f"{base_name}-splits"

    # Read and split before creating anything on disk, so that a missing or
    # unreadable input file does not leave an empty output directory behind.
    sections = split_text_for_translation(file_path, char_limit)
    os.makedirs(output_dir, exist_ok=True)

    num_digits = len(str(len(sections)))  # Calculate the number of digits for formatting

    for i, section in enumerate(sections, start=1):
        formatted_index = str(i).zfill(num_digits)  # Pad the index with leading zeros
        output_filename = os.path.join(output_dir, f"{base_name}_split_{formatted_index}.txt")
        with open(output_filename, 'w', encoding='utf-8') as output_file:
            output_file.write(section)
        print(f"Section {formatted_index} written to {output_filename}")
if __name__ == "__main__":
    # Command line: a required input file and an optional character limit.
    # Calling the script with only the input file uses the default limit.
    if len(sys.argv) not in (2, 3):
        print("Usage: python scriptname.py <inputfile> [char_limit]")
        sys.exit(1)

    inputfile = sys.argv[1]
    char_limit = 5000  # Default character limit when none is given

    if len(sys.argv) == 3:
        try:
            char_limit = int(sys.argv[2])
        except ValueError:
            print(f"Invalid character limit: {sys.argv[2]!r} (expected a positive integer)")
            sys.exit(1)
        if char_limit <= 0:
            print(f"Invalid character limit: {char_limit} (expected a positive integer)")
            sys.exit(1)

    main(inputfile, char_limit)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment