Created
May 3, 2023 08:40
-
-
Save btnguyen2k/2cadc210558714d1646f42a07a4bff5f to your computer and use it in GitHub Desktop.
Estimate the number of tokens for an OpenAI prompt.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// tokenWordRun matches one maximal run of word characters. It is compiled once
// at package scope so EstimateTokens does not pay for regexp compilation on
// every call. In Go's RE2 syntax \w is ASCII-only ([0-9A-Za-z_]), so \d is
// already covered and every non-ASCII rune counts as a non-word character.
var tokenWordRun = regexp.MustCompile(`\w+`)

// EstimateTokens estimates the number of tokens for an input string.
//
// It is a heuristic, not a real tokenizer. Two rough estimates are averaged:
//
//   - a structural estimate: every run of word characters contributes
//     ceil(bytes/4) units, scaled by 4/3, and every byte outside a word run
//     (whitespace, punctuation, non-ASCII bytes) contributes one unit;
//   - a size estimate: one unit per 4 bytes of input.
//
// All divisions are integer divisions, so very short inputs may yield 0
// (for example "" and "a" both return 0).
func EstimateTokens(input string) int {
	totalBytes := len(input)

	wordUnits, wordBytes := 0, 0
	for _, w := range tokenWordRun.FindAllString(input, -1) {
		wordBytes += len(w)
		wordUnits += int(math.Ceil(float64(len(w)) / 4.0))
	}

	// Word runs and non-word runs partition the input, so the non-word byte
	// count is simply whatever the word runs did not cover.
	nonWordBytes := totalBytes - wordBytes

	return ((wordUnits*4/3 + nonWordBytes) + totalBytes/4) / 2
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment