Last active
March 2, 2023 11:39
-
-
Save AmrSaber/2468f546fb67dc31576a14e1209870e6 to your computer and use it in GitHub Desktop.
Meaningful text splitting by chunk size (in Golang)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"flag" | |
"fmt" | |
"io" | |
"math" | |
"os" | |
"strings" | |
"time" | |
) | |
func main() { | |
var filePath string | |
var showChunks bool | |
var size int | |
flag.IntVar(&size, "size", 100, "chunk size") | |
flag.BoolVar(&showChunks, "show", false, "show chunks") | |
flag.StringVar(&filePath, "file", "", "file to read and chunk instead of the default sample") | |
flag.Parse() | |
sample := ` | |
This is some sentence. | |
And this is a sentence with a big word: supercalifragilisticexpialidocious. | |
And here are some split lines: | |
a b | |
x | |
x | |
bbb | |
ccc | |
` | |
if filePath != "" { | |
var bytes []byte | |
var err error | |
if filePath == "-" { | |
bytes, err = io.ReadAll(os.Stdin) | |
} else { | |
bytes, err = os.ReadFile(filePath) | |
} | |
if err != nil { | |
panic(err) | |
} | |
sample = string(bytes) | |
} | |
startTime := time.Now() | |
chunks := split(sample, size) | |
processingTime := time.Since(startTime) | |
fmt.Printf("%d chunks -- processed in %.3f ms\n", len(chunks), float32(processingTime.Microseconds())/1000) | |
if showChunks { | |
getDigits := func(x int) int { | |
return int(math.Log10(float64(x))) + 1 | |
} | |
pattern := fmt.Sprintf("%%0%dd [%%0%dd] | %%q\n", getDigits(len(chunks)), getDigits(size+1)) | |
for i, chunk := range chunks { | |
fmt.Printf(pattern, i+1, len(chunk), chunk) | |
} | |
} | |
} | |
func split(str string, size int) []string { | |
if size < 1 { | |
return nil | |
} | |
str = strings.TrimSpace(str) | |
start := 0 | |
chunks := make([]string, 0, len(str)/size) | |
for start < len(str) { | |
end := start + size | |
if end >= len(str) { | |
chunks = append(chunks, str[start:]) | |
break | |
} | |
// If the next character is a delimiter, take it | |
// this is to avoid adding "-" at the end when the next character is a delimiter anyway | |
next := str[end] | |
if next == ' ' || next == '\n' { | |
end++ | |
} | |
chunk := str[start:end] | |
cutWord := false | |
// Try to find a new line within the limit | |
length := strings.LastIndex(chunk, "\n") | |
// If no new line found, try to find a space | |
if length == -1 { | |
length = strings.LastIndex(chunk, " ") | |
// If no space found, then just split the text | |
if length == -1 { | |
length = len(chunk) - 1 // leave space for "-" character that will be appended | |
cutWord = true | |
} | |
} | |
chunk = chunk[:length] | |
start += length | |
if cutWord { | |
chunk += "-" | |
} else { | |
// Ignore the space that we stopped at | |
start++ | |
} | |
chunks = append(chunks, chunk) | |
} | |
return chunks | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
The complexity analysis is as follows:
s
, and worst case we do that twice.n
) minus some part of lengthl
(i.e. we found a delimiter at lengthl
).This leads to the following recurrence:
F(n) = s + F(n-l)
and solving this recurrence we get thatF(n) = O(ns/l)
.Using that info, the worst case is that
l = 1
ands = n
(I am not sure how that would happen though) which would giveO(n^2)
.But on average
l
is actually some fraction ofs
(we find a delimiter far away ins
). Assuming that, on average, we find a delimiter half way throughs
, thenl = s/2
which gives usO(n)
.