shawngraham/preprocess.sh

## preprocess.sh
#!/bin/bash
#
# preprocess.sh - pack many small article files into a few large text files
# sized to fit a model's context window.
#
# Each *.csv file in $input_folder is token-counted with `ttok` (text on
# stdin, count on stdout) and appended to an in-memory batch. Before a file
# is added, if it would push the batch past $upper_limit tokens, the batch is
# written to $output_folder/joined_<n>.txt and a fresh batch is started.
# A single file larger than $upper_limit still gets a batch of its own.

set -euo pipefail

input_folder="articles" # Replace with your input directory path
output_folder="processed_articles" # Replace with your output directory path

# Aim for ~100,000 tokens per output file and allow 20% overshoot, which
# stays under a 128k-token window. Only the upper bound is enforced.
target_token_count=100000
upper_limit=$((target_token_count + target_token_count / 5))

#######################################
# Join the files in $input_folder into token-bounded batches.
# Globals:
#   input_folder, output_folder, upper_limit (read)
# Outputs:
#   Writes $output_folder/joined_<n>.txt; diagnostics go to stderr.
# Returns:
#   0 on success; 1 if the output directory cannot be created, a file
#   cannot be read or written, or ttok fails or prints a non-integer.
#######################################
join_articles() {
    local file token_count file_content
    local current_token_count=0
    local joint_file_content=""
    local output_file_index=1

    mkdir -p -- "$output_folder" || return 1

    for file in "$input_folder"/*.csv; do #or .txt
        # An unmatched glob is left as the literal pattern, and a directory
        # could carry the suffix too; only regular files are processed.
        [[ -f "$file" ]] || continue

        # A missing or failing ttok must not be mistaken for a zero count.
        if ! token_count=$(ttok < "$file") || [[ ! "$token_count" =~ ^[0-9]+$ ]]; then
            printf "preprocess.sh: could not count tokens in '%s' (is ttok installed?)\n" "$file" >&2
            return 1
        fi
        # Force base 10 so a count with leading zeros is not read as octal.
        token_count=$((10#$token_count))

        # Flush the batch if this file would overflow it. The count guard
        # keeps an empty batch from being written as a zero-byte file when a
        # single input is larger than the limit on its own.
        if (( current_token_count > 0 && current_token_count + token_count > upper_limit )); then
            printf '%s' "$joint_file_content" \
                > "$output_folder/joined_${output_file_index}.txt" || return 1
            output_file_index=$((output_file_index + 1))
            joint_file_content=""
            current_token_count=0
        fi

        # $(< file) drops trailing newlines, so exactly one newline separates
        # consecutive files in the batch.
        file_content=$(< "$file") || return 1
        joint_file_content+="$file_content"$'\n'
        current_token_count=$((current_token_count + token_count))
    done

    # Write whatever is left in the final, possibly undersized, batch.
    if (( current_token_count > 0 )); then
        printf '%s' "$joint_file_content" \
            > "$output_folder/joined_${output_file_index}.txt" || return 1
    fi
}

join_articles || exit 1

echo "Files processed and joined under '$output_folder'."
	#!/bin/bash
	#
	# preprocess.sh - pack many small article files into a few large text files
	# sized to fit a model's context window.
	#
	# Each *.csv file in $input_folder is token-counted with `ttok` (text on
	# stdin, count on stdout) and appended to an in-memory batch. Before a file
	# is added, if it would push the batch past $upper_limit tokens, the batch is
	# written to $output_folder/joined_<n>.txt and a fresh batch is started.
	# A single file larger than $upper_limit still gets a batch of its own.

	set -euo pipefail

	input_folder="articles" # Replace with your input directory path
	output_folder="processed_articles" # Replace with your output directory path

	# Aim for ~100,000 tokens per output file and allow 20% overshoot, which
	# stays under a 128k-token window. Only the upper bound is enforced.
	target_token_count=100000
	upper_limit=$((target_token_count + target_token_count / 5))

	#######################################
	# Join the files in $input_folder into token-bounded batches.
	# Globals:
	#   input_folder, output_folder, upper_limit (read)
	# Outputs:
	#   Writes $output_folder/joined_<n>.txt; diagnostics go to stderr.
	# Returns:
	#   0 on success; 1 if the output directory cannot be created, a file
	#   cannot be read or written, or ttok fails or prints a non-integer.
	#######################################
	join_articles() {
		local file token_count file_content
		local current_token_count=0
		local joint_file_content=""
		local output_file_index=1

		mkdir -p -- "$output_folder" || return 1

		for file in "$input_folder"/*.csv; do #or .txt
			# An unmatched glob is left as the literal pattern, and a directory
			# could carry the suffix too; only regular files are processed.
			[[ -f "$file" ]] || continue

			# A missing or failing ttok must not be mistaken for a zero count.
			if ! token_count=$(ttok < "$file") || [[ ! "$token_count" =~ ^[0-9]+$ ]]; then
				printf "preprocess.sh: could not count tokens in '%s' (is ttok installed?)\n" "$file" >&2
				return 1
			fi
			# Force base 10 so a count with leading zeros is not read as octal.
			token_count=$((10#$token_count))

			# Flush the batch if this file would overflow it. The count guard
			# keeps an empty batch from being written as a zero-byte file when a
			# single input is larger than the limit on its own.
			if (( current_token_count > 0 && current_token_count + token_count > upper_limit )); then
				printf '%s' "$joint_file_content" \
					> "$output_folder/joined_${output_file_index}.txt" || return 1
				output_file_index=$((output_file_index + 1))
				joint_file_content=""
				current_token_count=0
			fi

			# $(< file) drops trailing newlines, so exactly one newline separates
			# consecutive files in the batch.
			file_content=$(< "$file") || return 1
			joint_file_content+="$file_content"$'\n'
			current_token_count=$((current_token_count + token_count))
		done

		# Write whatever is left in the final, possibly undersized, batch.
		if (( current_token_count > 0 )); then
			printf '%s' "$joint_file_content" \
				> "$output_folder/joined_${output_file_index}.txt" || return 1
		fi
	}

	join_articles || exit 1

	echo "Files processed and joined under '$output_folder'."