Skip to content

Instantly share code, notes, and snippets.

@bramses

bramses/chunk.ts Secret

Created January 4, 2022 23:37
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bramses/4bde47da4cc5b781595e3a424bee82ba to your computer and use it in GitHub Desktop.
Save bramses/4bde47da4cc5b781595e3a424bee82ba to your computer and use it in GitHub Desktop.
chunk
/**
 * Splits a document into the smallest number of near-equal chunks such that
 * no chunk is reported as too long by `checkLength`.
 *
 * The document is divided purely by character offsets: chunk boundaries do
 * not respect spaces, words or sentences, so a boundary may fall in the
 * middle of a word.
 *
 * Concatenating the returned chunks always reproduces the original document;
 * when the length is not evenly divisible, the remainder is spread across
 * chunks instead of being dropped.
 *
 * NOTE(review): `checkLength` is defined outside this block; it is assumed to
 * return `true` when the given text exceeds the API token limit - confirm.
 *
 * @param document document to be chunked
 * @returns chunks of the document, in order; `[document]` when no split is needed
 */
export function chunkDocument (document: string): string[] {
  const totalLength = document.length

  // Cuts the document into `parts` contiguous slices whose boundaries are
  // floor(i * length / parts). Sizes differ by at most one character and the
  // final boundary is exactly `totalLength`, so nothing is lost.
  const splitEvenly = (parts: number): string[] =>
    Array.from({ length: parts }, (_, i) =>
      document.slice(
        Math.floor((i * totalLength) / parts),
        Math.floor(((i + 1) * totalLength) / parts)
      )
    )

  let numOfSubdivisions = 1
  let chunks: string[] = [document]

  // Keep increasing the number of subdivisions until every chunk fits.
  // The count is capped at the document length: beyond one character per
  // chunk, further splitting could only produce empty chunks.
  while (
    numOfSubdivisions < totalLength &&
    chunks.some((chunk) => checkLength(chunk))
  ) {
    numOfSubdivisions++
    chunks = splitEvenly(numOfSubdivisions)
  }

  return chunks
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment