Skip to content

Instantly share code, notes, and snippets.

@bramses

bramses/chunk.ts Secret

Created Jan 4, 2022
Embed
What would you like to do?
chunk
/**
 * Splits a document into contiguous, roughly equal-sized chunks that each pass `checkLength`.
 *
 * The document is cut into 2, 3, 4, ... pieces until no piece is reported as too long.
 * Cuts are made purely by character offset: words and sentences are not respected, and
 * because `String.prototype.slice` works on UTF-16 code units a cut may even land inside
 * a surrogate pair. It is therefore not suitable for splitting text into sentences.
 *
 * Concatenating the returned chunks always yields the original document; no characters
 * are dropped when the length is not evenly divisible by the number of chunks.
 *
 * NOTE(review): `checkLength` is defined outside this block; it is assumed to return
 * `true` when the given text exceeds the maximum token count allowed by the API.
 *
 * @param document document to be chunked
 * @returns chunks of the document, in original order; `[document]` when it already fits
 */
export function chunkDocument (document: string): string[] {
  // An empty document cannot be subdivided; return it unchanged rather than looping.
  if (document.length === 0) {
    return [document]
  }

  let numOfSubdivisions = 1
  let chunks: string[] = [document]

  // Stop once every chunk fits, or once chunks are a single character long and
  // further subdivision is impossible (guarantees termination).
  while (
    numOfSubdivisions < document.length &&
    chunks.some((chunk) => checkLength(chunk))
  ) {
    numOfSubdivisions++
    // Round up so that the chunks together cover the whole document; rounding down
    // would silently discard the trailing `length % numOfSubdivisions` characters.
    const chunkLength = Math.ceil(document.length / numOfSubdivisions)
    chunks = []
    // Step by offset instead of by chunk index so no empty trailing chunks are produced.
    for (let start = 0; start < document.length; start += chunkLength) {
      chunks.push(document.slice(start, start + chunkLength))
    }
  }

  return chunks
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment