Skip to content

Instantly share code, notes, and snippets.

@peckjon
Created May 11, 2023 00:19
Show Gist options
  • Save peckjon/e0ecf60037ac54e69636e11b039c5091 to your computer and use it in GitHub Desktop.
Save peckjon/e0ecf60037ac54e69636e11b039c5091 to your computer and use it in GitHub Desktop.
Python script to split an input file into multiple files no longer than 3900 words each, breaking only on lines that start with the pattern "00;"
import os
import sys
# Set the maximum number of words per file
max_words = 3900
# Set the pattern to split on
pattern = "00;"


def _trim_blank_edges(lines):
    """Return *lines* without the whitespace-only lines at its start and end."""
    start = 0
    end = len(lines)
    while start < end and not lines[start].strip():
        start += 1
    while end > start and not lines[end - 1].strip():
        end -= 1
    # Slice once instead of popping from the front repeatedly.
    return lines[start:end]


def _iter_segments(lines, marker):
    """Yield lists of consecutive lines, cutting before each line that starts with *marker*.

    Lines that precede the first marker line are yielded as a segment of
    their own, so no input line is dropped.
    """
    segment = []
    for line in lines:
        if segment and line.startswith(marker):
            yield segment
            segment = []
        segment.append(line)
    if segment:
        yield segment


def _write_chunk(lines, path, encoding=None):
    """Write *lines* to *path*, minus leading/trailing blank lines.

    The file is opened in "w" mode so that re-running the script replaces
    earlier output instead of appending to it.
    """
    with open(path, "w", encoding=encoding) as out_file:
        out_file.writelines(_trim_blank_edges(lines))


def split_file(input_file, output_dir="output", word_limit=max_words,
               marker=pattern, encoding=None):
    """Split *input_file* into numbered text files of at most *word_limit* words.

    The input is cut only in front of lines that start with *marker*. The
    resulting segments are packed, in order, into ``output1.txt``,
    ``output2.txt``, ... inside *output_dir* (created if missing). A file is
    closed before the segment that would push it past *word_limit*, so a file
    only exceeds the limit when a single segment is itself longer than the
    limit (it cannot be cut any further). Words are counted with
    ``str.split()``, i.e. whitespace-separated tokens.

    Blank lines at the start and end of every output file are removed.
    Existing files with the same names are overwritten; higher-numbered
    files left behind by an earlier run are not deleted.

    Args:
        input_file: Path of the text file to split.
        output_dir: Directory that receives the ``output<N>.txt`` files.
        word_limit: Maximum number of words per output file.
        marker: Line prefix at which a new file may begin.
        encoding: Text encoding passed to ``open()``; ``None`` keeps the
            platform default.

    Returns:
        The number of output files written (0 for an empty input file).

    Raises:
        OSError: If the input cannot be read or the output cannot be written.
    """
    os.makedirs(output_dir, exist_ok=True)

    file_count = 0
    chunk = []
    chunk_words = 0

    with open(input_file, "r", encoding=encoding) as f:
        for segment in _iter_segments(f, marker):
            segment_words = sum(len(line.split()) for line in segment)
            # Only flush a chunk that already holds words: a chunk made of
            # blank lines alone is merged into the next one rather than
            # producing an empty file.
            if chunk_words and chunk_words + segment_words > word_limit:
                file_count += 1
                _write_chunk(
                    chunk,
                    os.path.join(output_dir, f"output{file_count}.txt"),
                    encoding,
                )
                chunk = []
                chunk_words = 0
            chunk.extend(segment)
            chunk_words += segment_words

    # Whatever is left over becomes the last file.
    if chunk:
        file_count += 1
        _write_chunk(
            chunk,
            os.path.join(output_dir, f"output{file_count}.txt"),
            encoding,
        )

    return file_count


def main(argv=None):
    """Command-line entry point: ``python split_file.py <input_file>``.

    Prints a usage message and exits with status 1 when no input file is
    given; otherwise splits the file into the ``output`` directory.
    """
    argv = sys.argv if argv is None else argv
    # Get the input file name from the command line arguments
    if len(argv) < 2:
        print("Usage: python split_file.py <input_file>")
        sys.exit(1)
    split_file(argv[1])


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment