Skip to content

Instantly share code, notes, and snippets.

@kindziora
Last active October 12, 2021 14:01
Show Gist options
  • Save kindziora/f641c4f9a4958fea91ed9c7ee81a9e0e to your computer and use it in GitHub Desktop.
Save kindziora/f641c4f9a4958fea91ed9c7ee81a9e0e to your computer and use it in GitHub Desktop.
Chunk a string into segments of a maximum length while keeping words intact.
/**
 * Split a text into word-preserving segments of roughly `chunkSize` characters.
 *
 * Words are never cut. The length limit is deliberately not strict: words are
 * appended to the current segment, and the segment is closed as soon as its
 * length plus one separating space exceeds `chunkSize`. The word that crosses
 * the limit stays in the segment, so a segment may be longer than `chunkSize`.
 *
 * @param {string} text - Text to split; words are separated by spaces.
 *   Runs of spaces are treated as a single separator.
 * @param {number} chunkSize - Target segment length in UTF-16 code units.
 * @returns {string[]} Segments in original order, without leading or trailing
 *   spaces. Empty when `text` contains no words.
 */
function chunk(text, chunkSize) {
  const result = [];
  let segmentWords = [];
  // Length of segmentWords.join(" "), tracked incrementally.
  let segmentLength = 0;

  for (const word of text.split(" ")) {
    if (word === "") {
      continue;
    }
    segmentLength += word.length + (segmentWords.length > 0 ? 1 : 0);
    segmentWords.push(word);

    // "+ 1" accounts for the separator that would follow this word.
    if (segmentLength + 1 > chunkSize) {
      result.push(segmentWords.join(" "));
      segmentWords = [];
      segmentLength = 0;
    }
  }

  // Flush the words that never reached the limit; without this the tail of
  // the text would be silently dropped.
  if (segmentWords.length > 0) {
    result.push(segmentWords.join(" "));
  }
  return result;
}
// Demo: split a sample sentence with a target segment length of 15 and print the result.
const anakin = "Anakin Skywalker, später unter dem Namen Darth Vader bekannt, war der Sohn von Shmi Skywalker, der Ehemann von Padmé Amidala und der Vater von Luke Skywalker";
console.log(chunk(anakin, 15));
/**
 * Chunk in strictly framed segments while keeping words intact (step 1 of 2):
 * annotate every word of `text` with its offsets and the block it belongs to.
 *
 * A word's block key is its end offset rounded down to a multiple of
 * `chunkSize`, plus the accumulated padding. Whenever adding a word (plus one
 * separator) would push its block past `chunkSize`, the padding grows by
 * `chunkSize` and the word moves to the block key recomputed with that padding.
 *
 * @param {string} text - Text to index; words are separated by single spaces.
 * @param {number} chunkSize - Maximum block length, counting one separator per word.
 * @returns {Array<{start: number, end: number, segment: string, blockIndex: number, blockLength: number}>}
 *   One entry per word, in order. `start` is the length of all preceding words
 *   joined by single spaces (the separator directly before the word is not
 *   counted), `end` is `start` plus the word length, `blockIndex` is the block
 *   key, and `blockLength` is the running length of that block including this
 *   word.
 */
function createWordOffsets(text, chunkSize) {
  const indexed = [];
  const words = text.split(" ");
  const blockLength = { 0: 0 };
  let padding = 0;
  // Length of words.slice(0, i).join(" "), maintained incrementally instead of
  // re-joining the whole prefix for every word.
  let joinedLength = 0;

  for (let i = 0; i < words.length; i++) {
    const word = words[i];
    const start = joinedLength;
    const end = start + word.length;

    let blockIndex = end + padding - (end % chunkSize);
    if (blockLength[blockIndex] === undefined) {
      blockLength[blockIndex] = 0;
    }

    if (blockLength[blockIndex] + word.length + 1 > chunkSize) {
      padding += chunkSize;
      blockIndex = end + padding - (end % chunkSize);
      // The shifted block may be brand new; initialise it so the running
      // length stays a number instead of becoming NaN.
      if (blockLength[blockIndex] === undefined) {
        blockLength[blockIndex] = 0;
      }
    }

    blockLength[blockIndex] += word.length + 1;
    indexed.push({
      start,
      end,
      segment: word,
      blockIndex,
      blockLength: blockLength[blockIndex],
    });

    joinedLength = (i === 0 ? 0 : joinedLength + 1) + word.length;
  }
  return indexed;
}
/**
 * Chunk in strictly framed segments (step 2 of 2): join indexed words that
 * share a block key into one string per block.
 *
 * Blocks are emitted in order of first appearance in `words`. Every block
 * that holds at least one word is emitted, including blocks whose key is
 * greater than the last word's `end` offset; block keys that hold no words
 * produce no entry.
 *
 * @param {Array<{segment: string, blockIndex: number}>} words - Word entries
 *   carrying at least `segment` and `blockIndex`.
 * @param {number} chunkSize - Unused; kept so existing callers keep working.
 * @returns {string[]} One space-joined string per block; empty for empty input.
 */
function chunk(words, chunkSize) {
  // Map keeps insertion order, so blocks come out in the order their first
  // word appears, and each word is visited only once.
  const segmentsByBlock = new Map();
  for (const { blockIndex, segment } of words) {
    const segments = segmentsByBlock.get(blockIndex);
    if (segments === undefined) {
      segmentsByBlock.set(blockIndex, [segment]);
    } else {
      segments.push(segment);
    }
  }
  return [...segmentsByBlock.values()].map((segments) => segments.join(" "));
}
// Demo: index the words of a sample sentence for a block size of 16, print the
// index, then print the resulting chunks together with their lengths.
const anakin = "Anakin Skywalker, später unter dem Namen Darth Vader bekannt, war der Sohn von Shmi Skywalker, der Ehemann von Padmé Amidala und der Vater von Luke Skywalker";
const chunksize = 16;
const w = createWordOffsets(anakin, chunksize);
console.log(w);
const allchunks = chunk(w, chunksize);
console.log(allchunks, allchunks.map((piece) => piece.length));
def createWordOffsets(text, chunkSize):
    """Annotate every word of ``text`` with its offsets and block assignment.

    A word's block key is its end offset rounded down to a multiple of
    ``chunkSize``, plus the accumulated padding. Whenever adding a word (plus
    one separator) would push its block past ``chunkSize``, the padding grows
    by ``chunkSize`` and the word moves to the block key recomputed with that
    padding.

    Args:
        text: Text to index; words are separated by single spaces.
        chunkSize: Maximum block length, counting one separator per word.

    Returns:
        A list with one dict per word, in order, with the keys ``start``
        (length of all preceding words joined by single spaces; the separator
        directly before the word is not counted), ``end`` (``start`` plus the
        word length), ``segment`` (the word), ``blockIndex`` (the block key)
        and ``blockLength`` (running length of that block including this word).
    """
    indexed = []
    words = text.split(" ")
    blockLength = {0: 0}
    padding = 0
    # Length of " ".join(words[:i]), maintained incrementally instead of
    # re-joining the whole prefix for every word.
    joined_length = 0

    for i, word in enumerate(words):
        start = joined_length
        end = start + len(word)

        blockIndex = (end + padding) - (end % chunkSize)
        blockLength.setdefault(blockIndex, 0)

        if blockLength[blockIndex] + len(word) + 1 > chunkSize:
            padding += chunkSize
            blockIndex = (end + padding) - (end % chunkSize)
            # The shifted block may be brand new.
            blockLength.setdefault(blockIndex, 0)

        blockLength[blockIndex] += len(word) + 1
        indexed.append({
            "start": start,
            "end": end,
            "segment": word,
            "blockIndex": blockIndex,
            "blockLength": blockLength[blockIndex],
        })

        joined_length = (0 if i == 0 else joined_length + 1) + len(word)

    return indexed
def chunk(words, chunkSize):
    """Join indexed words that share a block key into one string per block.

    Blocks are emitted in order of first appearance in ``words``. Every block
    that holds at least one word is emitted, including blocks whose key is
    greater than the last word's ``end`` offset; block keys that hold no words
    produce no entry.

    Args:
        words: List of dicts carrying at least ``segment`` and ``blockIndex``.
        chunkSize: Unused; kept so existing callers keep working.

    Returns:
        A list with one space-joined string per block; empty for empty input.
    """
    # dict keeps insertion order, so blocks come out in the order their first
    # word appears, and each word is visited only once.
    segments_by_block = {}
    for entry in words:
        segments_by_block.setdefault(entry["blockIndex"], []).append(entry["segment"])
    return [" ".join(segments) for segments in segments_by_block.values()]
# Demo: index the words of a sample sentence for a block size of 20, print the
# index, then print the chunks built from it.
anakin = "Anakin Skywalker, später unter dem Namen Darth Vader bekannt, war der Sohn von Shmi Skywalker, der Ehemann von Padmé Amidala und der Vater von Luke Skywalker"
chunksize = 20
# Per-word offsets and block assignments.
w = createWordOffsets(anakin, chunksize)
print(w)
# Words regrouped into one string per block.
allchunks = chunk(w, chunksize)
print(allchunks)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment