btnguyen2k/oai_esttokens.go

## oai_esttokens.go
// Patterns used by EstimateTokens. They are compiled once at package
// initialization rather than on every call.
var (
	// estTokensNonWordRun matches a maximal run of characters outside \w.
	// In Go's regexp syntax \w is ASCII-only ([0-9A-Za-z_]), so any
	// non-ASCII character is treated as a non-word character.
	estTokensNonWordRun = regexp.MustCompile(`[^\w\d]+`)
	// estTokensWordRun matches a maximal run of \w characters.
	estTokensWordRun = regexp.MustCompile(`[\w\d]+`)
)

// EstimateTokens estimates the number of tokens for an input string.
//
// The result is the integer average of two heuristics:
//   - a word-based estimate: every run of word characters contributes
//     ceil(bytes/4) units, the sum of units is scaled by 4/3, and every byte
//     that is not part of a word run contributes 1;
//   - a byte-based estimate: len(input)/4.
//
// All divisions are integer divisions, so short inputs may yield 0.
// An empty input yields 0.
func EstimateTokens(input string) int {
	// Splitting on non-word runs leaves the word runs; empty pieces
	// (produced at the string edges) contribute ceil(0/4) == 0.
	numWords := 0
	for _, w := range estTokensNonWordRun.Split(input, -1) {
		numWords += int(math.Ceil(float64(len(w)) / 4.0))
	}

	// Splitting on word runs leaves the non-word runs; each of their bytes
	// counts as one unit.
	numNonWords := 0
	for _, nw := range estTokensWordRun.Split(input, -1) {
		numNonWords += len(nw)
	}

	wordBased := numWords*4/3 + numNonWords
	byteBased := len(input) / 4
	return (wordBased + byteBased) / 2
}
	// Precompiled patterns for EstimateTokens, built once at package
	// initialization so that calls do not recompile them.
	var (
		// reTokenSeparators matches one or more consecutive characters that
		// are not in \w (Go's \w is ASCII-only: [0-9A-Za-z_]).
		reTokenSeparators = regexp.MustCompile(`[^\w\d]+`)
		// reTokenWords matches one or more consecutive \w characters.
		reTokenWords = regexp.MustCompile(`[\w\d]+`)
	)

	// EstimateTokens estimates the number of tokens for an input string.
	//
	// Two heuristics are computed and averaged with integer division:
	//   - word-based: each run of word characters counts ceil(bytes/4), the
	//     total is multiplied by 4/3, and each byte outside a word run adds 1;
	//   - byte-based: len(input)/4.
	//
	// Because every division is an integer division, very short inputs can
	// produce 0. An empty input produces 0.
	func EstimateTokens(input string) int {
		// Pieces between separators are the word runs. Empty pieces add
		// nothing, so they are skipped.
		wordUnits := 0
		for _, word := range reTokenSeparators.Split(input, -1) {
			if word == "" {
				continue
			}
			wordUnits += int(math.Ceil(float64(len(word)) / 4.0))
		}

		// Pieces between word runs are the separator runs; count their bytes.
		separatorBytes := 0
		for _, sep := range reTokenWords.Split(input, -1) {
			separatorBytes += len(sep)
		}

		totalBytes := len(input)
		return ((wordUnits*4/3 + separatorBytes) + totalBytes/4) / 2
	}