Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save ink-splatters/d0899a4749261b7dfd4818f40ff062b4 to your computer and use it in GitHub Desktop.
llama.cpp choosing BPE pre-tokenizer logic
airstation:llama.cpp ic$ git rev-parse HEAD
952d03dbead16e4dbdd1d3458486340673cc2465
airstation:llama.cpp ic$ echo ; awk '(NR>=4341 && NR<=4382 ){print NR " " $0}' llama.cpp
4341 // for now, only BPE models have pre-tokenizers
4342 if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
4343 if (tokenizer_pre.empty()) {
4344 LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__);
4345 LLAMA_LOG_WARN("%s: \n", __func__);
4346 LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
4347 LLAMA_LOG_WARN("%s: GENERATION QUALITY WILL BE DEGRADED! \n", __func__);
4348 LLAMA_LOG_WARN("%s: CONSIDER REGENERATING THE MODEL \n", __func__);
4349 LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
4350 LLAMA_LOG_WARN("%s: \n", __func__);
4351 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
4352 } else if (
4353 tokenizer_pre == "default") {
4354 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
4355 } else if (
4356 tokenizer_pre == "llama3" ||
4357 tokenizer_pre == "llama-v3" ||
4358 tokenizer_pre == "llama-bpe") {
4359 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
4360 } else if (
4361 tokenizer_pre == "deepseek-llm") {
4362 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM;
4363 } else if (
4364 tokenizer_pre == "deepseek-coder") {
4365 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER;
4366 } else if (
4367 tokenizer_pre == "falcon") {
4368 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_FALCON;
4369 } else if (
4370 tokenizer_pre == "mpt") {
4371 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MPT;
4372 } else if (
4373 tokenizer_pre == "starcoder") {
4374 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_STARCODER;
4375 } else if (
4376 tokenizer_pre == "gpt-2") {
4377 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
4378 } else {
4379 throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
4380 }
4381 } else {
4382 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment