Created
May 17, 2024 11:32
-
-
Save akx/bdde8c2184f6238e490e38b30620c650 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
./tests/test-tokenizer-0 ./models/ggml-vocab-viking-7b.gguf | |
main : reading vocab from: './models/ggml-vocab-viking-7b.gguf' | |
llama_model_loader: loaded meta data with 23 key-value pairs and 0 tensors from ./models/ggml-vocab-viking-7b.gguf (version GGUF V3 (latest)) | |
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output. | |
llama_model_loader: - kv 0: general.architecture str = llama | |
llama_model_loader: - kv 1: general.name str = viking-7b | |
llama_model_loader: - kv 2: llama.block_count u32 = 32 | |
llama_model_loader: - kv 3: llama.context_length u32 = 4096 | |
llama_model_loader: - kv 4: llama.embedding_length u32 = 4096 | |
llama_model_loader: - kv 5: llama.feed_forward_length u32 = 11008 | |
llama_model_loader: - kv 6: llama.attention.head_count u32 = 32 | |
llama_model_loader: - kv 7: llama.attention.head_count_kv u32 = 32 | |
llama_model_loader: - kv 8: llama.rope.freq_base f32 = 10000.000000 | |
llama_model_loader: - kv 9: llama.attention.layer_norm_rms_epsilon f32 = 0.000010 | |
llama_model_loader: - kv 10: general.file_type u32 = 1 | |
llama_model_loader: - kv 11: llama.vocab_size u32 = 131072 | |
llama_model_loader: - kv 12: llama.rope.dimension_count u32 = 128 | |
llama_model_loader: - kv 13: tokenizer.ggml.model str = gpt2 | |
llama_model_loader: - kv 14: tokenizer.ggml.pre str = viking-7b | |
llama_model_loader: - kv 15: tokenizer.ggml.tokens arr[str,131072] = ["<unk>", "<s>", "</s>", "<pad>", "<f... | |
llama_model_loader: - kv 16: tokenizer.ggml.token_type arr[i32,131072] = [3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ... | |
llama_model_loader: - kv 17: tokenizer.ggml.merges arr[str,130814] = ["e r", "Ġ Ġ", "e n", "i n", "Ġ s"... | |
llama_model_loader: - kv 18: tokenizer.ggml.bos_token_id u32 = 1 | |
llama_model_loader: - kv 19: tokenizer.ggml.eos_token_id u32 = 2 | |
llama_model_loader: - kv 20: tokenizer.ggml.unknown_token_id u32 = 0 | |
llama_model_loader: - kv 21: tokenizer.ggml.padding_token_id u32 = 3 | |
llama_model_loader: - kv 22: general.quantization_version u32 = 2 | |
llm_load_vocab: mismatch in special tokens definition ( 11/131072 vs 24/131072 ). | |
llm_load_print_meta: format = GGUF V3 (latest) | |
llm_load_print_meta: arch = llama | |
llm_load_print_meta: vocab type = BPE | |
llm_load_print_meta: n_vocab = 131072 | |
llm_load_print_meta: n_merges = 130814 | |
llm_load_print_meta: n_ctx_train = 0 | |
llm_load_print_meta: n_embd = 0 | |
llm_load_print_meta: n_head = 0 | |
llm_load_print_meta: n_head_kv = 0 | |
llm_load_print_meta: n_layer = 0 | |
llm_load_print_meta: n_rot = 0 | |
llm_load_print_meta: n_embd_head_k = 0 | |
llm_load_print_meta: n_embd_head_v = 0 | |
llm_load_print_meta: n_gqa = 0 | |
llm_load_print_meta: n_embd_k_gqa = 0 | |
llm_load_print_meta: n_embd_v_gqa = 0 | |
llm_load_print_meta: f_norm_eps = 0.0e+00 | |
llm_load_print_meta: f_norm_rms_eps = 0.0e+00 | |
llm_load_print_meta: f_clamp_kqv = 0.0e+00 | |
llm_load_print_meta: f_max_alibi_bias = 0.0e+00 | |
llm_load_print_meta: f_logit_scale = 0.0e+00 | |
llm_load_print_meta: n_ff = 0 | |
llm_load_print_meta: n_expert = 0 | |
llm_load_print_meta: n_expert_used = 0 | |
llm_load_print_meta: causal attn = 1 | |
llm_load_print_meta: pooling type = 0 | |
llm_load_print_meta: rope type = -1 | |
llm_load_print_meta: rope scaling = none | |
llm_load_print_meta: freq_base_train = 0.0 | |
llm_load_print_meta: freq_scale_train = 0 | |
llm_load_print_meta: n_yarn_orig_ctx = 0 | |
llm_load_print_meta: rope_finetuned = unknown | |
llm_load_print_meta: ssm_d_conv = 0 | |
llm_load_print_meta: ssm_d_inner = 0 | |
llm_load_print_meta: ssm_d_state = 0 | |
llm_load_print_meta: ssm_dt_rank = 0 | |
llm_load_print_meta: model type = ?B | |
llm_load_print_meta: model ftype = all F32 | |
llm_load_print_meta: model params = 0.00 K | |
llm_load_print_meta: model size = 0.00 MiB (nan BPW) | |
llm_load_print_meta: general.name = viking-7b | |
llm_load_print_meta: BOS token = 1 '<s>' | |
llm_load_print_meta: EOS token = 2 '</s>' | |
llm_load_print_meta: UNK token = 0 '<unk>' | |
llm_load_print_meta: PAD token = 3 '<pad>' | |
llm_load_print_meta: LF token = 150 'Ä' | |
llm_load_print_meta: EOT token = 23 '<|im_end|>' | |
llama_model_load: vocab only - skipping tensors | |
llama_new_context_with_model: n_ctx = 512 | |
llama_new_context_with_model: n_batch = 512 | |
llama_new_context_with_model: n_ubatch = 512 | |
llama_new_context_with_model: flash_attn = 0 | |
llama_new_context_with_model: freq_base = 0.0 | |
llama_new_context_with_model: freq_scale = 1 | |
src: '' | |
res: '' | |
tok: | |
src: ' ' | |
res: ' ' | |
tok: 208 | |
src: ' | |
' | |
res: ' | |
' | |
tok: 125043 | |
src: ' | |
' | |
res: ' | |
' | |
tok: 209 | |
src: ' | |
' | |
res: ' | |
' | |
tok: 534 | |
src: ' | |
' | |
res: ' | |
' | |
tok: 4090 | |
src: ' | |
🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL' | |
res: ' | |
🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL' | |
tok: 746 2392 55899 208 29480 19883 103830 442 954 209 2781 259 233 376 24841 32 8819 138 68448 63329 128 15736 376 57353 1603 7704 294 31756 4787 1071 32 94674 3666 123 258 2781 123 258 231 42 231 42 42 231 42 42 42 231 42 42 42 42 231 42 42 42 42 42 231 42 42 42 42 42 42 231 42 42 42 42 42 42 42 231 42 42 42 42 42 42 42 42 231 42 37 42 231 42 478 42 231 42 919 42 231 33532 233 33532 138 33532 252 89464 244 33532 248 89464 235 33532 255 33532 139 33532 264 89464 234 33532 264 33532 119 33532 138 33532 238 27310 234 2748 23613 72376 13227 42284 105535 40 42 40 43 40 44 40 49056 43400 263 417 34550 130761 28469 17228 1085 13676 23950 128516 56400 30124 37322 108492 3437 3395 3395 30917 17846 2420 13728 3963 383 7029 1912 630 107 733 627 689 1923 35 630 1417 791 6189 54 630 68 835 6189 383 6704 2463 590 35 630 59 791 1647 2032 22940 54 2221 30 6815 279 79905 67 | |
main : failed test: ' | |
🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL' | |
main : detokenized to: ' | |
🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL' instead of ' | |
🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL' | |
main : expected tokens: 746 ' | |
', 2392 ' | |
', 55899 ' | |
', 208 ' ', 29480 ' ', 19883 ' ', 103830 ' | |
', 442 ' | |
', 954 ' | |
', 209 ' | |
', 2781 '�', 259 '�', 233 '�', 376 ' (', 24841 'normal', 32 ')', 8819 ' �', 138 '�', 68448 '', 63329 '�', 128 '�', 15736 '️', 376 ' (', 57353 'multiple', 1603 ' em', 7704 'oj', 294 'is', 31756 ' conc', 4787 'aten', 1071 'ated', 32 ')', 94674 ' ✅', 3666 ' �', 123 '�', 258 '�', 2781 '�', 123 '�', 258 '�', 231 ' ', 42 '3', 231 ' ', 42 '3', 42 '3', 231 ' ', 42 '3', 42 '3', 42 '3', 231 ' ', 42 '3', 42 '3', 42 '3', 42 '3', 231 ' ', 42 '3', 42 '3', 42 '3', 42 '3', 42 '3', 231 ' ', 42 '3', 42 '3', 42 '3', 42 '3', 42 '3', 42 '3', 231 ' ', 42 '3', 42 '3', 42 '3', 42 '3', 42 '3', 42 '3', 42 '3', 231 ' ', 42 '3', 42 '3', 42 '3', 42 '3', 42 '3', 42 '3', 42 '3', 42 '3', 231 ' ', 42 '3', 37 '.', 42 '3', 231 ' ', 42 '3', 478 '..', 42 '3', 231 ' ', 42 '3', 919 '...', 42 '3', 231 ' ', 33532 '�', 233 '�', 33532 '�', 138 '�', 33532 '�', 252 '�', 89464 '�', 244 '�', 33532 '�', 248 '�', 89464 '�', 235 '�', 33532 '�', 255 '�', 33532 '�', 139 '�', 33532 '�', 264 '�', 89464 '�', 234 '�', 33532 '�', 264 '�', 33532 '�', 119 '�', 33532 '�', 138 '�', 33532 '�', 238 '�', 27310 '�', 234 '�', 2748 ' ?', 23613 '我', 72376 '想', 13227 '在', 42284 'apple', 105535 '工作', 40 '1', 42 '3', 40 '1', 43 '4', 40 '1', 44 '5', 40 '1', 49056 '天', 43400 '�', 263 '�', 417 ' -', 34550 '-----', 130761 '=======', 28469 ' не', 17228 'щ', 1085 'о', 13676 ' на', 23950 ' Б', 128516 'ългар', 56400 'ски', 30124 ' '''', 37322 ''''', 108492 '````', 3437 '```', 3395 '""', 3395 '""', 30917 '......', 17846 '!!!!', 2420 '!!', 13728 '????', 3963 '??', 9873 ' I've', 1912 ' been', 37493 ' 't', 733 'old', 17600 ' he's', 1923 ' there', 35 ',', 630 ' '', 1417 'RE', 791 ' you', 6189 ' sure', 54 '?', 23586 ' 'M', 835 ' not', 6189 ' sure', 18068 ' I'll', 2463 ' make', 590 ' it', 35 ',', 35018 ' 'D', 791 ' you', 1647 ' like', 2032 ' some', 22940 ' tea', 54 '?', 2221 ' We', 30 ''', 6815 'Ve', 279 ' a', 79905 ''l', 67 'L', | |
main : got tokens: 746 ' | |
', 2392 ' | |
', 55899 ' | |
', 208 ' ', 29480 ' ', 19883 ' ', 103830 ' | |
', 442 ' | |
', 954 ' | |
', 209 ' | |
', 2781 '�', 259 '�', 233 '�', 376 ' (', 24841 'normal', 32 ')', 8819 ' �', 138 '�', 68448 '', 63329 '�', 128 '�', 15736 '️', 376 ' (', 57353 'multiple', 1603 ' em', 7704 'oj', 294 'is', 31756 ' conc', 4787 'aten', 1071 'ated', 32 ')', 94674 ' ✅', 3666 ' �', 123 '�', 258 '�', 2781 '�', 123 '�', 258 '�', 231 ' ', 42 '3', 231 ' ', 42 '3', 42 '3', 231 ' ', 42 '3', 42 '3', 42 '3', 231 ' ', 42 '3', 42 '3', 42 '3', 42 '3', 231 ' ', 42 '3', 42 '3', 42 '3', 42 '3', 42 '3', 231 ' ', 42 '3', 42 '3', 42 '3', 42 '3', 42 '3', 42 '3', 231 ' ', 42 '3', 42 '3', 42 '3', 42 '3', 42 '3', 42 '3', 42 '3', 231 ' ', 42 '3', 42 '3', 42 '3', 42 '3', 42 '3', 42 '3', 42 '3', 42 '3', 231 ' ', 42 '3', 37 '.', 42 '3', 231 ' ', 42 '3', 478 '..', 42 '3', 231 ' ', 42 '3', 919 '...', 42 '3', 231 ' ', 33532 '�', 233 '�', 33532 '�', 138 '�', 33532 '�', 252 '�', 89464 '�', 244 '�', 33532 '�', 248 '�', 89464 '�', 235 '�', 33532 '�', 255 '�', 33532 '�', 139 '�', 33532 '�', 264 '�', 89464 '�', 234 '�', 33532 '�', 264 '�', 33532 '�', 119 '�', 33532 '�', 138 '�', 33532 '�', 238 '�', 27310 '�', 234 '�', 2748 ' ?', 23613 '我', 72376 '想', 13227 '在', 42284 'apple', 105535 '工作', 40 '1', 42 '3', 40 '1', 43 '4', 40 '1', 44 '5', 40 '1', 49056 '天', 43400 '�', 263 '�', 417 ' -', 34550 '-----', 130761 '=======', 28469 ' не', 17228 'щ', 1085 'о', 13676 ' на', 23950 ' Б', 128516 'ългар', 56400 'ски', 30124 ' '''', 37322 ''''', 108492 '````', 3437 '```', 3395 '""', 3395 '""', 30917 '......', 17846 '!!!!', 2420 '!!', 13728 '????', 3963 '??', 383 ' I', 7029 ''ve', 1912 ' been', 630 ' '', 107 't', 733 'old', 627 ' he', 689 ''s', 1923 ' there', 35 ',', 630 ' '', 1417 'RE', 791 ' you', 6189 ' sure', 54 '?', 630 ' '', 68 'M', 835 ' not', 6189 ' sure', 383 ' I', 6704 ''ll', 2463 ' make', 590 ' it', 35 ',', 630 ' '', 59 'D', 791 ' you', 1647 ' like', 2032 ' some', 22940 ' tea', 54 '?', 2221 ' We', 30 ''', 6815 'Ve', 279 ' a', 79905 ''l', 67 'L', | |
src: ' | |
=' | |
res: ' | |
=' | |
tok: 209 449 | |
src: ' ' | |
res: ' ' | |
tok: 231 | |
src: ' ' | |
res: ' ' | |
tok: 268 | |
src: ' ' | |
res: ' ' | |
tok: 348 | |
src: ' Hello' | |
res: ' Hello' | |
tok: 348 40540 | |
src: ' Hello | |
Hello' | |
res: ' Hello | |
Hello' | |
tok: 348 40540 209 348 40540 | |
main : failed test: ' Hello | |
Hello' | |
main : detokenized to: ' Hello | |
Hello' instead of ' Hello | |
Hello' | |
main : expected tokens: 348 ' ', 40540 ' Hello', 472 ' | |
', 40540 ' Hello', | |
main : got tokens: 348 ' ', 40540 ' Hello', 209 ' | |
', 348 ' ', 40540 ' Hello', | |
src: ' Hello' | |
res: ' Hello' | |
tok: 268 40540 | |
src: ' Hello' | |
res: ' Hello' | |
tok: 231 40540 | |
src: ' (' | |
res: ' (' | |
tok: 376 | |
src: ' Hello' | |
res: ' Hello' | |
tok: 40540 | |
src: ' Hello World' | |
res: ' Hello World' | |
tok: 40540 5685 | |
src: ' Hello World!' | |
res: ' Hello World!' | |
tok: 40540 5685 24 | |
src: ' Hello world' | |
res: ' Hello world' | |
tok: 40540 3576 | |
src: ' Hello, world!' | |
res: ' Hello, world!' | |
tok: 40540 35 3576 24 | |
src: ' this is 🦙.cpp' | |
res: ' this is 🦙.cpp' | |
tok: 749 501 3666 123 258 37 20744 | |
src: '' era' | |
res: '' era' | |
tok: 30 18323 | |
src: '3' | |
res: '3' | |
tok: 42 | |
src: '33' | |
res: '33' | |
tok: 42 42 | |
src: '333' | |
res: '333' | |
tok: 42 42 42 | |
src: '3333' | |
res: '3333' | |
tok: 42 42 42 42 | |
src: '33333' | |
res: '33333' | |
tok: 42 42 42 42 42 | |
src: '333333' | |
res: '333333' | |
tok: 42 42 42 42 42 42 | |
src: '3333333' | |
res: '3333333' | |
tok: 42 42 42 42 42 42 42 | |
src: '33333333' | |
res: '33333333' | |
tok: 42 42 42 42 42 42 42 42 | |
src: '333333333' | |
res: '333333333' | |
tok: 42 42 42 42 42 42 42 42 42 | |
src: 'Führer' | |
res: 'Führer' | |
tok: 61 48261 3214 | |
src: 'Hello' | |
res: 'Hello' | |
tok: 44312 | |
src: 'Hello World' | |
res: 'Hello World' | |
tok: 44312 5685 | |
src: 'Hello world' | |
res: 'Hello world' | |
tok: 44312 3576 | |
src: 'Hello, world!' | |
res: 'Hello, world!' | |
tok: 44312 35 3576 24 | |
src: 'Hello, y'all! How are you 😁 ?我想在apple工作1314151天~' | |
res: 'Hello, y'all! How are you 😁 ?我想在apple工作1314151天~' | |
tok: 44312 35 426 30 451 24 4686 788 791 8819 234 2748 23613 72376 13227 42284 105535 40 42 40 43 40 44 40 49056 43400 263 | |
src: 'ied 4 ½ months' | |
res: 'ied 4 ½ months' | |
tok: 1502 231 43 231 1177 9290 | |
src: 'w048 7tuijk dsdfhu' | |
res: 'w048 7tuijk dsdfhu' | |
tok: 110 39 43 47 231 46 107 2193 23909 49979 2758 6052 | |
src: 'нещо на Български' | |
res: 'нещо на Български' | |
tok: 62602 17228 1085 13676 23950 128516 56400 | |
src: 'កាន់តែពិសេសអាចខលចេញ' | |
res: 'កាន់តែពិសេសអាចខលចេញ' | |
tok: 33532 233 33532 138 33532 252 89464 244 33532 248 89464 235 33532 255 33532 139 33532 264 89464 234 33532 264 33532 119 33532 138 33532 238 33532 234 33532 260 33532 238 89464 234 33532 242 | |
src: '🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)' | |
res: '🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)' | |
tok: 2781 259 233 376 24841 32 8819 138 68448 63329 128 15736 376 57353 1603 7704 294 31756 4787 1071 32 94674 376 8418 130228 619 1192 1658 3623 13129 32 | |
Tests failed |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment