# FlyingFathead/splitter.py

## splitter.py
# v0.03 // added preformat sanitizer for hyphenation and other text formatting

import shutil
import sys
import os
import re

# print term width horizontal line
def hz_line(character='-'):
    """Print ``character`` repeated once per column of the current terminal.

    Args:
        character: String to repeat; defaults to ``'-'``.
    """
    columns, _rows = shutil.get_terminal_size()
    print(character * columns)

# text preprocessing & sanitization
def sanitize_content(text):
    """Normalize hard-wrapped text into one line per paragraph.

    Steps, in order:

    1. Runs of whitespace other than newlines collapse to a single space.
    2. Spaces directly around a newline are dropped, so a line holding only
       whitespace counts as a blank (paragraph-separating) line and merged
       lines are joined by exactly one space.
    3. A hyphen between two word characters that is split across a line
       break is removed and the two halves are joined.
    4. Every remaining single newline becomes a space; runs of two or more
       newlines (paragraph breaks) are kept as they are.

    Note that step 3 cannot tell hyphenation from a genuine compound that
    happens to wrap at its hyphen ("well-" / "known" becomes "wellknown").

    Args:
        text: The raw text to clean up.

    Returns:
        The cleaned text.
    """
    # Collapse horizontal whitespace first so the later patterns only ever
    # have to deal with single spaces.
    text = re.sub(r'[^\S\n]+', ' ', text)
    # Trim the (at most one) space on each side of every newline.
    text = re.sub(r' ?\n ?', '\n', text)
    # Require word characters on both sides: this keeps dashes intact and
    # never swallows the blank line that follows a paragraph ending in '-'.
    text = re.sub(r'(?<=\w)-\n(?=\w)', '', text)
    # Merge lines into paragraphs: only lone newlines are replaced.
    text = re.sub(r'(?<!\n)\n(?!\n)', ' ', text)
    return text

def split_text_for_translation(file_path, char_limit):
    """Read a UTF-8 text file and split it into sections of limited size.

    A banner with the active limit is printed, the file is read and passed
    through ``sanitize_content``, and the resulting lines are packed into
    sections. A section is closed just before a non-blank line that would
    bring it to ``char_limit`` characters or more. Blank lines never trigger
    a split, and lines are never broken up, so a single line longer than the
    limit produces an oversized section of its own.

    Args:
        file_path: Path of the text file to read.
        char_limit: Section size limit, in characters.

    Returns:
        A list of section strings; every line in a section is terminated
        with a newline. The list is empty when the input contains no
        non-whitespace text.
    """
    hz_line()
    print(f"Current character split limit (from next empty line): {char_limit} characters.")
    hz_line()

    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # Sanitize the text for hyphenation
    content = sanitize_content(content)

    sections = []
    current_lines = []   # lines of the section being built, newline included
    current_length = 0   # total characters held in current_lines
    has_text = False     # True once any non-blank line has been seen

    for line in content.split('\n'):
        is_blank = not line.strip()
        # Only close a section that already holds text; otherwise an
        # oversized first line would emit an empty or blank-only section.
        if not is_blank and has_text and current_length + len(line) >= char_limit:
            sections.append(''.join(current_lines))
            current_lines = []
            current_length = 0
        current_lines.append(line + '\n')
        current_length += len(line) + 1
        if not is_blank:
            has_text = True

    # Whitespace-only input yields no sections at all.
    if has_text:
        sections.append(''.join(current_lines))

    return sections

def main(file_path, char_limit):
    """Split ``file_path`` into sections and write each one to its own file.

    Files are written to a ``<base_name>-splits`` directory (a relative
    path, so it is created under the current working directory) and named
    ``<base_name>_split_<index>.txt``. ``<base_name>`` is the input file
    name without directory or extension; ``<index>`` starts at 1 and is
    zero-padded so all indices have the same width.

    Args:
        file_path: Path of the text file to split.
        char_limit: Section size limit passed on to
            ``split_text_for_translation``.
    """
    base_name = os.path.splitext(os.path.basename(file_path))[0]
    output_dir = f"{base_name}-splits"

    # Read and split before touching the file system: if the input cannot
    # be read, the error propagates without leaving an empty directory.
    sections = split_text_for_translation(file_path, char_limit)
    os.makedirs(output_dir, exist_ok=True)

    num_digits = len(str(len(sections)))  # Width needed for the largest index

    for i, section in enumerate(sections, start=1):
        formatted_index = str(i).zfill(num_digits)  # Pad the index with leading zeros
        output_filename = os.path.join(output_dir, f"{base_name}_split_{formatted_index}.txt")
        with open(output_filename, 'w', encoding='utf-8') as output_file:
            output_file.write(section)
        print(f"Section {formatted_index} written to {output_filename}")

if __name__ == "__main__":
    # Section size used when no limit is given on the command line.
    default_char_limit = 5000

    # Accepted forms: "<inputfile>" or "<inputfile> <char_limit>".
    if len(sys.argv) not in (2, 3):
        script_name = os.path.basename(sys.argv[0])
        print(f"Usage: python {script_name} <inputfile> [char_limit]", file=sys.stderr)
        sys.exit(1)

    inputfile = sys.argv[1]
    char_limit = default_char_limit

    if len(sys.argv) == 3:
        try:
            char_limit = int(sys.argv[2])
        except ValueError:
            print(f"Error: char_limit must be an integer, got {sys.argv[2]!r}", file=sys.stderr)
            sys.exit(1)
        if char_limit <= 0:
            print(f"Error: char_limit must be positive, got {char_limit}", file=sys.stderr)
            sys.exit(1)

    main(inputfile, char_limit)
	# v0.03 // added preformat sanitizer for hyphenation and other text formatting

	import shutil
	import sys
	import os
	import re

	# print term width horizontal line
	def hz_line(character='-'):
	    """Print ``character`` repeated once per column of the current terminal.

	    Args:
	        character: String to repeat; defaults to ``'-'``.
	    """
	    columns, _rows = shutil.get_terminal_size()
	    print(character * columns)

	# text preprocessing & sanitization
	def sanitize_content(text):
	    """Normalize hard-wrapped text into one line per paragraph.

	    Steps, in order:

	    1. Runs of whitespace other than newlines collapse to a single space.
	    2. Spaces directly around a newline are dropped, so a line holding only
	       whitespace counts as a blank (paragraph-separating) line and merged
	       lines are joined by exactly one space.
	    3. A hyphen between two word characters that is split across a line
	       break is removed and the two halves are joined.
	    4. Every remaining single newline becomes a space; runs of two or more
	       newlines (paragraph breaks) are kept as they are.

	    Note that step 3 cannot tell hyphenation from a genuine compound that
	    happens to wrap at its hyphen ("well-" / "known" becomes "wellknown").

	    Args:
	        text: The raw text to clean up.

	    Returns:
	        The cleaned text.
	    """
	    # Collapse horizontal whitespace first so the later patterns only ever
	    # have to deal with single spaces.
	    text = re.sub(r'[^\S\n]+', ' ', text)
	    # Trim the (at most one) space on each side of every newline.
	    text = re.sub(r' ?\n ?', '\n', text)
	    # Require word characters on both sides: this keeps dashes intact and
	    # never swallows the blank line that follows a paragraph ending in '-'.
	    text = re.sub(r'(?<=\w)-\n(?=\w)', '', text)
	    # Merge lines into paragraphs: only lone newlines are replaced.
	    text = re.sub(r'(?<!\n)\n(?!\n)', ' ', text)
	    return text

	def split_text_for_translation(file_path, char_limit):
	    """Read a UTF-8 text file and split it into sections of limited size.

	    A banner with the active limit is printed, the file is read and passed
	    through ``sanitize_content``, and the resulting lines are packed into
	    sections. A section is closed just before a non-blank line that would
	    bring it to ``char_limit`` characters or more. Blank lines never trigger
	    a split, and lines are never broken up, so a single line longer than the
	    limit produces an oversized section of its own.

	    Args:
	        file_path: Path of the text file to read.
	        char_limit: Section size limit, in characters.

	    Returns:
	        A list of section strings; every line in a section is terminated
	        with a newline. The list is empty when the input contains no
	        non-whitespace text.
	    """
	    hz_line()
	    print(f"Current character split limit (from next empty line): {char_limit} characters.")
	    hz_line()

	    with open(file_path, 'r', encoding='utf-8') as file:
	        content = file.read()

	    # Sanitize the text for hyphenation
	    content = sanitize_content(content)

	    sections = []
	    current_lines = []   # lines of the section being built, newline included
	    current_length = 0   # total characters held in current_lines
	    has_text = False     # True once any non-blank line has been seen

	    for line in content.split('\n'):
	        is_blank = not line.strip()
	        # Only close a section that already holds text; otherwise an
	        # oversized first line would emit an empty or blank-only section.
	        if not is_blank and has_text and current_length + len(line) >= char_limit:
	            sections.append(''.join(current_lines))
	            current_lines = []
	            current_length = 0
	        current_lines.append(line + '\n')
	        current_length += len(line) + 1
	        if not is_blank:
	            has_text = True

	    # Whitespace-only input yields no sections at all.
	    if has_text:
	        sections.append(''.join(current_lines))

	    return sections

	def main(file_path, char_limit):
	    """Split ``file_path`` into sections and write each one to its own file.

	    Files are written to a ``<base_name>-splits`` directory (a relative
	    path, so it is created under the current working directory) and named
	    ``<base_name>_split_<index>.txt``. ``<base_name>`` is the input file
	    name without directory or extension; ``<index>`` starts at 1 and is
	    zero-padded so all indices have the same width.

	    Args:
	        file_path: Path of the text file to split.
	        char_limit: Section size limit passed on to
	            ``split_text_for_translation``.
	    """
	    base_name = os.path.splitext(os.path.basename(file_path))[0]
	    output_dir = f"{base_name}-splits"

	    # Read and split before touching the file system: if the input cannot
	    # be read, the error propagates without leaving an empty directory.
	    sections = split_text_for_translation(file_path, char_limit)
	    os.makedirs(output_dir, exist_ok=True)

	    num_digits = len(str(len(sections)))  # Width needed for the largest index

	    for i, section in enumerate(sections, start=1):
	        formatted_index = str(i).zfill(num_digits)  # Pad the index with leading zeros
	        output_filename = os.path.join(output_dir, f"{base_name}_split_{formatted_index}.txt")
	        with open(output_filename, 'w', encoding='utf-8') as output_file:
	            output_file.write(section)
	        print(f"Section {formatted_index} written to {output_filename}")

	if __name__ == "__main__":
	    # Section size used when no limit is given on the command line.
	    default_char_limit = 5000

	    # Accepted forms: "<inputfile>" or "<inputfile> <char_limit>".
	    if len(sys.argv) not in (2, 3):
	        script_name = os.path.basename(sys.argv[0])
	        print(f"Usage: python {script_name} <inputfile> [char_limit]", file=sys.stderr)
	        sys.exit(1)

	    inputfile = sys.argv[1]
	    char_limit = default_char_limit

	    if len(sys.argv) == 3:
	        try:
	            char_limit = int(sys.argv[2])
	        except ValueError:
	            print(f"Error: char_limit must be an integer, got {sys.argv[2]!r}", file=sys.stderr)
	            sys.exit(1)
	        if char_limit <= 0:
	            print(f"Error: char_limit must be positive, got {char_limit}", file=sys.stderr)
	            sys.exit(1)

	    main(inputfile, char_limit)