Skip to content

Instantly share code, notes, and snippets.

@ScotterC
Created February 28, 2023 16:25
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ScotterC/b0ea4f53bd588bf2e9834c7ef4e5bdda to your computer and use it in GitHub Desktop.
Save ScotterC/b0ea4f53bd588bf2e9834c7ef4e5bdda to your computer and use it in GitHub Desktop.
Chunk up text using langchain's text splitters
#!/usr/bin/env python3
# Note: if spaCy isn't working, you may need to download the English model first: python -m spacy download en
import argparse
import json
from langchain.text_splitter import NLTKTextSplitter, CharacterTextSplitter, RecursiveCharacterTextSplitter, SpacyTextSplitter, TokenTextSplitter
def chunkify(text, chunk_size, chunk_overlap, method):
    """Split *text* into chunks with the text splitter selected by *method*.

    Args:
        text: The string to split.
        chunk_size: Forwarded unchanged to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded unchanged to the splitter as
            ``chunk_overlap``.
        method: Name of the splitter to use. One of ``'nltk'``,
            ``'character'``, ``'recursive'``, ``'spacy'`` or ``'token'``.

    Returns:
        A dict with the single key ``'chunks'`` holding whatever the
        splitter's ``split_text`` returned for *text*.

    Raises:
        ValueError: If *method* is not one of the supported names.
    """
    # Each entry is wrapped in a lambda so a splitter class is only looked up
    # once its method has actually been selected; an unknown method is
    # rejected before any splitter is touched.
    builders = {
        'nltk': lambda **kw: NLTKTextSplitter.from_tiktoken_encoder(**kw),
        'character': lambda **kw: CharacterTextSplitter.from_tiktoken_encoder(**kw),
        'recursive': lambda **kw: RecursiveCharacterTextSplitter.from_tiktoken_encoder(**kw),
        'spacy': lambda **kw: SpacyTextSplitter.from_tiktoken_encoder(**kw),
        # TokenTextSplitter is constructed directly rather than through
        # from_tiktoken_encoder.
        'token': lambda **kw: TokenTextSplitter(**kw),
    }

    try:
        build = builders[method]
    except (KeyError, TypeError):
        # TypeError covers unhashable values (e.g. a list), which must be
        # reported the same way as any other unsupported method.
        raise ValueError(f"Invalid method: {method}") from None

    text_splitter = build(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    chunks = text_splitter.split_text(text)
    return {'chunks': chunks}
if __name__ == '__main__':
    # Command line entry point: read text from --text or --input-file, chunk
    # it with chunkify(), and write the result as JSON to --output-file.

    # Set up command line argument parser.
    parser = argparse.ArgumentParser(description='Chunkify a long string of text.')
    # Exactly one input source must be supplied: inline text or a file path.
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('--text', type=str, help='the long string of text to be chunked up')
    group.add_argument('--input-file', type=str, help='the file containing the text to be chunked up')
    parser.add_argument('--output-file', type=str, required=True, help='the file to write the chunked output to')
    parser.add_argument('--chunk-size', type=int, default=600, help='the size of each chunk')
    parser.add_argument('--chunk-overlap', type=int, default=100, help='the overlap between each chunk')
    # `choices` makes an unsupported method fail during argument parsing,
    # before any input file is read.
    parser.add_argument(
        '--method',
        type=str,
        default='nltk',
        choices=('nltk', 'character', 'recursive', 'spacy', 'token'),
        help='the method to use for chunking up the text',
    )
    args = parser.parse_args()

    # Get the input text. Compare against None instead of relying on
    # truthiness: `--text ""` is a legitimate (empty) input and must not fall
    # through to opening a non-existent --input-file.
    if args.text is not None:
        text = args.text
    else:
        with open(args.input_file, 'r', encoding='utf-8') as f:
            text = f.read()

    # Chunkify the text.
    result = chunkify(text, args.chunk_size, args.chunk_overlap, args.method)

    # Write the output to a file.
    with open(args.output_file, 'w', encoding='utf-8') as f:
        json.dump(result, f, indent=2)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment