Created
November 8, 2023 16:24
-
-
Save shawngraham/47abeddba92c68bc2fc03397f4d0beb6 to your computer and use it in GitHub Desktop.
Use ttok to count the tokens in each file in a folder, then join files together in order to reduce the number of calls to GPT-4 Turbo. The idea is that we use ttok to work out how many tokens are in each file and compare the running total against an upper limit. If we're beneath the limit, we join the files together. If we're within 20% of the limit, we stop, write to file, then start …
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Join small text files into larger batches sized by token count, so that
# fewer requests are needed when the batches are sent to a model.
#
# Usage: <this-script> [input_folder] [output_folder]
#
# Every *.csv file in input_folder is measured with `ttok` (file content is
# fed on stdin, the token count is read from stdout). Files are appended to
# the current batch until adding the next one would push the batch past
# upper_limit; the batch is then written to output_folder/joined_<n>.txt and
# a new batch is started.
#
# Requires: ttok on PATH.

# Unset variables are errors; command failures are checked explicitly below.
set -uo pipefail

input_folder="${1:-articles}"             # Replace with your input directory path
output_folder="${2:-processed_articles}"  # Replace with your output directory path

# Target token count per batch. A batch may overshoot the target by up to 20%
# (original note: "4t has 128k window", so 120,000 still leaves headroom).
target_token_count=100000
upper_limit=$((target_token_count + target_token_count / 5))

# Print an error message to stderr and abort the script.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

# Write one finished batch to disk.
# Arguments: $1 - destination path, $2 - batch content
write_batch() {
  local path=$1
  local content=$2
  # printf '%s' instead of echo -n: content is arbitrary text and must never
  # be interpreted as echo options or escape sequences.
  printf '%s' "$content" > "$path" || die "cannot write '$path'"
}

#######################################
# Group the *.csv files of a folder into token-bounded batches.
# Arguments:
#   $1 - input folder to read files from
#   $2 - output folder (created if missing)
#   $3 - maximum number of tokens allowed in one batch
# Outputs:
#   Writes joined_1.txt, joined_2.txt, ... into the output folder;
#   warnings and errors go to stderr.
# Returns:
#   0 on success; exits non-zero if ttok or a file operation fails.
#######################################
join_files_by_tokens() {
  local in_dir=$1
  local out_dir=$2
  local limit=$3
  local file token_count content
  local current_token_count=0
  local joint_file_content=""
  local output_file_index=1

  # Create the output directory if it does not exist.
  mkdir -p -- "$out_dir" || die "cannot create output directory '$out_dir'"

  if [[ ! -d "$in_dir" ]]; then
    printf 'warning: input folder %s not found; nothing to join\n' "'$in_dir'" >&2
    return 0
  fi

  for file in "$in_dir"/*.csv; do  # or *.txt
    # Skips the literal pattern when nothing matches, as well as directories
    # and dangling symlinks, without abandoning the remaining files.
    [[ -f "$file" ]] || continue

    # Use ttok to count the tokens in the current file.
    if ! token_count=$(ttok < "$file"); then
      die "ttok failed on '$file' (is ttok installed and on PATH?)"
    fi
    token_count=${token_count//[[:space:]]/}
    # Anything but a plain integer would break the arithmetic below.
    [[ "$token_count" =~ ^[0-9]+$ ]] \
      || die "unexpected ttok output for '$file': '$token_count'"

    if (( token_count > limit )); then
      printf 'warning: %s has %s tokens, above the limit of %s; it gets a batch of its own\n' \
        "'$file'" "$token_count" "$limit" >&2
    fi

    # Flush the current batch if this file would push it past the limit.
    # The "> 0" guard prevents writing an empty output file when the very
    # first file of a batch is already larger than the limit.
    if (( current_token_count > 0 && current_token_count + token_count > limit )); then
      write_batch "$out_dir/joined_${output_file_index}.txt" "$joint_file_content"
      output_file_index=$((output_file_index + 1))
      joint_file_content=""
      current_token_count=0
    fi

    current_token_count=$((current_token_count + token_count))

    # Command substitution strips the file's trailing newlines; exactly one
    # newline is added back so that file contents stay clearly separated.
    content=$(< "$file") || die "cannot read '$file'"
    joint_file_content+="${content}"$'\n'
  done

  # Write the last batch, which usually does not reach the target count.
  if (( current_token_count > 0 )); then
    write_batch "$out_dir/joined_${output_file_index}.txt" "$joint_file_content"
  fi
}

join_files_by_tokens "$input_folder" "$output_folder" "$upper_limit"
printf '%s\n' "Files processed and joined under '$output_folder'."
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment