Last active
March 2, 2023 11:39
-
-
Save AmrSaber/2468f546fb67dc31576a14e1209870e6 to your computer and use it in GitHub Desktop.
Meaningful text splitting by chunk size (in Golang)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"flag" | |
"fmt" | |
"io" | |
"math" | |
"os" | |
"strings" | |
"time" | |
) | |
func main() { | |
var filePath string | |
var showChunks bool | |
var size int | |
flag.IntVar(&size, "size", 100, "chunk size") | |
flag.BoolVar(&showChunks, "show", false, "show chunks") | |
flag.StringVar(&filePath, "file", "", "file to read and chunk instead of the default sample") | |
flag.Parse() | |
sample := ` | |
This is some sentence. | |
And this is a sentence with a big word: supercalifragilisticexpialidocious. | |
And here are some split lines: | |
a b | |
x | |
x | |
bbb | |
ccc | |
` | |
if filePath != "" { | |
var bytes []byte | |
var err error | |
if filePath == "-" { | |
bytes, err = io.ReadAll(os.Stdin) | |
} else { | |
bytes, err = os.ReadFile(filePath) | |
} | |
if err != nil { | |
panic(err) | |
} | |
sample = string(bytes) | |
} | |
startTime := time.Now() | |
chunks := split(sample, size) | |
processingTime := time.Since(startTime) | |
fmt.Printf("%d chunks -- processed in %.3f ms\n", len(chunks), float32(processingTime.Microseconds())/1000) | |
if showChunks { | |
getDigits := func(x int) int { | |
return int(math.Log10(float64(x))) + 1 | |
} | |
pattern := fmt.Sprintf("%%0%dd [%%0%dd] | %%q\n", getDigits(len(chunks)), getDigits(size+1)) | |
for i, chunk := range chunks { | |
fmt.Printf(pattern, i+1, len(chunk), chunk) | |
} | |
} | |
} | |
// split cuts str into chunks of at most size bytes, preferring to break at
// the last newline inside each window, then at the last space; only when a
// single word fills the whole window does it cut mid-word, marking the cut
// with a trailing "-". Words shorter than size are never split. Leading and
// trailing whitespace of str is trimmed first. A size < 1 returns nil.
//
// NOTE(review): indices are byte offsets, so a mid-word cut can land inside
// a multi-byte UTF-8 rune — presumably acceptable for this tool; confirm if
// inputs may be non-ASCII.
func split(str string, size int) []string {
	if size < 1 {
		return nil
	}
	str = strings.TrimSpace(str)

	chunks := make([]string, 0, len(str)/size)
	start := 0
	for start < len(str) {
		end := start + size
		if end >= len(str) {
			// Everything left fits in one chunk.
			chunks = append(chunks, str[start:])
			break
		}

		// If the character just past the window is a delimiter, include it:
		// the word ends exactly at the boundary, so cutting there avoids a
		// needless "-" marker.
		if c := str[end]; c == ' ' || c == '\n' {
			end++
		}
		chunk := str[start:end]

		// Prefer breaking at the last newline in the window, else the last
		// space.
		cut := strings.LastIndex(chunk, "\n")
		if cut == -1 {
			cut = strings.LastIndex(chunk, " ")
		}
		if cut >= 0 {
			chunks = append(chunks, chunk[:cut])
			start += cut + 1 // +1 skips the delimiter we broke at
			continue
		}

		// No delimiter in the window: we must cut the word itself.
		if len(chunk) == 1 {
			// size == 1 leaves no room for the "-" marker. Emit the bare
			// character. (The original code set length = 0 here and never
			// advanced start, looping forever.)
			chunks = append(chunks, chunk)
			start++
			continue
		}
		// Reserve the last byte of the window for the "-" marker.
		chunks = append(chunks, chunk[:len(chunk)-1]+"-")
		start += len(chunk) - 1
	}
	return chunks
}
The complexity analysis is as follows:
- In each iteration of the main for-loop, we search a string of size `s`, and in the worst case we do that search twice (once for a newline, once for a space).
- By the end of each iteration, we are left with the original string (of size `n`) minus a prefix of length `l` (i.e. we found a delimiter at index `l`).
This leads to the recurrence F(n) = s + F(n - l), and solving it gives F(n) = O(ns/l).
Using that, the worst case is l = 1 and s = n (I am not sure how that would happen in practice, though), which gives O(n^2).
But on average, l is some fraction of s (the delimiter is found far into the window). Assuming that, on average, we find a delimiter halfway through s, then l = s/2, which gives us O(n).
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
The function `split` splits the given string based on the provided size, but tries to do so with as little distortion to the string as possible. This is useful when the chunk size is the hard limit but you don't care much about the number of chunks (e.g. when sending text to an API with an input-size limit). For each chunk it tries to cut at the furthest newline; if none is found, then at the furthest space; if still no space is found, it cuts the word itself as a last resort. If a word is smaller than the chunk size, this algorithm guarantees that it won't be split.
You can run the code using `go run split-string.go`, and optionally pass the following flags:
- `--show` to show the resulting chunks.
- `--size [number]` to provide the chunk size (default is 100).
- `--file [file-path]` to provide a text file to be processed instead of the sample text; it can be `-` to read from stdin.
This algorithm runs in O(n^2) complexity in the worst case and O(n) in the average case, where n is the length of the provided string. It is pretty fast in practice: it can process the Moby-Dick novel (1.2 MB) with a chunk size of 100 in under a millisecond.