Skip to content

Instantly share code, notes, and snippets.

@beachside-project
Created July 27, 2023 23:19
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save beachside-project/9b5a61d49e6facf95ba936785f115f9b to your computer and use it in GitHub Desktop.
Save beachside-project/9b5a61d49e6facf95ba936785f115f9b to your computer and use it in GitHub Desktop.
Tokenizer sample
using Microsoft.DeepDev;
namespace OpenAISdkSamples;
/// <summary>
/// Sample showing how to count the tokens of a piece of text with a tokenizer
/// obtained from <c>TokenizerBuilder</c> (Microsoft.DeepDev).
/// </summary>
public class TokenizerSample
{
    /// <summary>Marker placed in front of the text before it is encoded.</summary>
    private const string StartMarker = "<|im_start|>";

    /// <summary>Marker placed after the text before it is encoded.</summary>
    private const string EndMarker = "<|im_end|>";

    /// <summary>Number of markers wrapped around the text; removed again from the final count.</summary>
    private const int MarkerCount = 2;

    /// <summary>
    /// Marker-to-integer table handed to the tokenizer builder.
    /// NOTE(review): the values are presumably the special-token ids expected by the
    /// target model's encoding; confirm against the tokenizer library documentation.
    /// </summary>
    private static readonly Dictionary<string, int> MarkerIds = new()
    {
        [StartMarker] = 100264,
        [EndMarker] = 100265,
    };

    /// <summary>The markers passed to <c>Encode</c> as allowed special strings.</summary>
    private static readonly IReadOnlyCollection<string> AllowedMarkers =
        new HashSet<string> { StartMarker, EndMarker };

    /// <summary>
    /// Counts the tokens of a fixed Japanese sample sentence for "gpt-3.5-turbo"
    /// and writes the result to the console.
    /// </summary>
    public async Task RunAsync()
    {
        const string sample = "こんにちは、猫さん";
        int tokenCount = await CountTokenAsync(sample, "gpt-3.5-turbo");
        Console.WriteLine(tokenCount);
    }

    /// <summary>
    /// Encodes <paramref name="text"/> wrapped in the start/end markers and returns the
    /// size of the encoded result minus the two markers.
    /// </summary>
    /// <param name="text">Text whose tokens are counted.</param>
    /// <param name="modelName">Model name used to look up the tokenizer.</param>
    /// <returns>The encoded element count with the two wrapping markers subtracted.</returns>
    public async Task<int> CountTokenAsync(string text, string modelName)
    {
        // Building the wrapped input does not depend on the tokenizer, so it is prepared first.
        var wrapped = string.Concat(StartMarker, text, EndMarker);

        var tokenizer = await TokenizerBuilder.CreateByModelNameAsync(modelName, MarkerIds);

        // Assumes each marker is encoded as exactly one element; hence the fixed subtraction.
        return tokenizer.Encode(wrapped, AllowedMarkers).Count - MarkerCount;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment