Last active
March 2, 2023 11:39
-
-
Save AmrSaber/2468f546fb67dc31576a14e1209870e6 to your computer and use it in GitHub Desktop.
Meaningful text splitting by chunk size (in Golang)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"flag" | |
"fmt" | |
"io" | |
"math" | |
"os" | |
"strings" | |
"time" | |
) | |
func main() { | |
var filePath string | |
var showChunks bool | |
var size int | |
flag.IntVar(&size, "size", 100, "chunk size") | |
flag.BoolVar(&showChunks, "show", false, "show chunks") | |
flag.StringVar(&filePath, "file", "", "file to read and chunk instead of the default sample") | |
flag.Parse() | |
sample := ` | |
This is some sentence. | |
And this is a sentence with a big word: supercalifragilisticexpialidocious. | |
And here are some split lines: | |
a b | |
x | |
x | |
bbb | |
ccc | |
` | |
if filePath != "" { | |
var bytes []byte | |
var err error | |
if filePath == "-" { | |
bytes, err = io.ReadAll(os.Stdin) | |
} else { | |
bytes, err = os.ReadFile(filePath) | |
} | |
if err != nil { | |
panic(err) | |
} | |
sample = string(bytes) | |
} | |
startTime := time.Now() | |
chunks := split(sample, size) | |
processingTime := time.Since(startTime) | |
fmt.Printf("%d chunks -- processed in %.3f ms\n", len(chunks), float32(processingTime.Microseconds())/1000) | |
if showChunks { | |
getDigits := func(x int) int { | |
return int(math.Log10(float64(x))) + 1 | |
} | |
pattern := fmt.Sprintf("%%0%dd [%%0%dd] | %%q\n", getDigits(len(chunks)), getDigits(size+1)) | |
for i, chunk := range chunks { | |
fmt.Printf(pattern, i+1, len(chunk), chunk) | |
} | |
} | |
} | |
// split cuts str into chunks of at most size bytes, preferring to break at
// the last newline inside each window, then at the last space; only when a
// single word fills the whole window does it cut mid-word, marking the cut
// with a trailing "-". Words shorter than size are never split. Leading and
// trailing whitespace of str is trimmed first. A size < 1 returns nil.
//
// NOTE(review): indices are byte offsets, so a mid-word cut can land inside
// a multi-byte UTF-8 rune — presumably acceptable for this tool; confirm if
// inputs may be non-ASCII.
func split(str string, size int) []string {
	if size < 1 {
		return nil
	}
	str = strings.TrimSpace(str)

	chunks := make([]string, 0, len(str)/size)
	start := 0
	for start < len(str) {
		end := start + size
		if end >= len(str) {
			// Everything left fits in one chunk.
			chunks = append(chunks, str[start:])
			break
		}

		// If the character just past the window is a delimiter, include it:
		// the word ends exactly at the boundary, so cutting there avoids a
		// needless "-" marker.
		if c := str[end]; c == ' ' || c == '\n' {
			end++
		}
		chunk := str[start:end]

		// Prefer breaking at the last newline in the window, else the last
		// space.
		cut := strings.LastIndex(chunk, "\n")
		if cut == -1 {
			cut = strings.LastIndex(chunk, " ")
		}
		if cut >= 0 {
			chunks = append(chunks, chunk[:cut])
			start += cut + 1 // +1 skips the delimiter we broke at
			continue
		}

		// No delimiter in the window: we must cut the word itself.
		if len(chunk) == 1 {
			// size == 1 leaves no room for the "-" marker. Emit the bare
			// character. (The original code set length = 0 here and never
			// advanced start, looping forever.)
			chunks = append(chunks, chunk)
			start++
			continue
		}
		// Reserve the last byte of the window for the "-" marker.
		chunks = append(chunks, chunk[:len(chunk)-1]+"-")
		start += len(chunk) - 1
	}
	return chunks
}
The complexity analysis is as follows:
- In each iteration of the main for-loop, we search a string of size `s`, and in the worst case we do that search twice (once for a newline, once for a space).
- By the end of each iteration, we are left with the original string (of size `n`) minus a prefix of length `l` (i.e. we found a delimiter at index `l`).
This leads to the recurrence F(n) = s + F(n - l), and solving it gives F(n) = O(ns/l).
Using that, the worst case is l = 1 and s = n (I am not sure how that would happen in practice, though), which gives O(n^2).
But on average, l is some fraction of s (the delimiter is found far into the window). Assuming that, on average, we find a delimiter halfway through s, then l = s/2, which gives us O(n).
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
The function `split` splits the given string based on the provided size, but tries to do so with as little distortion to the string as possible. This is useful when the chunk size is the hard limit but you don't care much about the number of chunks (e.g. when sending text to an API with an input-size limit). For each chunk it tries to cut at the furthest newline; if none is found, then at the furthest space; if still no space is found, it cuts the word itself as a last resort. If a word is smaller than the chunk size, this algorithm guarantees that it won't be split.
You can run the code using `go run split-string.go`, and optionally pass the following flags:
- `--show` to show the resulting chunks.
- `--size [number]` to provide the chunk size (default is 100).
- `--file [file-path]` to provide a text file to be processed instead of the sample text; it can be `-` to read from stdin.
This algorithm runs in O(n^2) complexity in the worst case and O(n) in the average case, where n is the length of the provided string. It is pretty fast in practice: it can process the Moby-Dick novel (1.2 MB) with a chunk size of 100 in under a millisecond.