Created
February 4, 2024 09:41
-
-
Save alex4o/12a29218a9860a8f1dad7c087adfa6b4 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Log start | |
main: build = 22 (277fad3) | |
main: built with clang version 17.0.6 for aarch64-unknown-linux-android24 | |
main: seed = 1707038499 | |
ggml_vulkan: Using Mali-G78 | uma: 1 | fp16: 1 | warp size: 16 | |
llama_model_loader: loaded meta data with 21 key-value pairs and 325 tensors from ../phi-2-orange.Q4_K_M.gguf (version GGUF V3 (latest)) | |
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output. | |
llama_model_loader: - kv 0: general.architecture str = phi2 | |
llama_model_loader: - kv 1: general.name str = Phi2 | |
llama_model_loader: - kv 2: phi2.context_length u32 = 2048 | |
llama_model_loader: - kv 3: phi2.embedding_length u32 = 2560 | |
llama_model_loader: - kv 4: phi2.feed_forward_length u32 = 10240 | |
llama_model_loader: - kv 5: phi2.block_count u32 = 32 | |
llama_model_loader: - kv 6: phi2.attention.head_count u32 = 32 | |
llama_model_loader: - kv 7: phi2.attention.head_count_kv u32 = 32 | |
llama_model_loader: - kv 8: phi2.attention.layer_norm_epsilon f32 = 0.000010 | |
llama_model_loader: - kv 9: phi2.rope.dimension_count u32 = 32 | |
llama_model_loader: - kv 10: general.file_type u32 = 15 | |
llama_model_loader: - kv 11: tokenizer.ggml.add_bos_token bool = false | |
llama_model_loader: - kv 12: tokenizer.ggml.model str = gpt2 | |
llama_model_loader: - kv 13: tokenizer.ggml.tokens arr[str,51200] = ["!", "\"", "#", "$", "%", "&", "'", ... | |
llama_model_loader: - kv 14: tokenizer.ggml.token_type arr[i32,51200] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | |
llama_model_loader: - kv 15: tokenizer.ggml.merges arr[str,50000] = ["Ġ t", "Ġ a", "h e", "i n", "r e",... | |
llama_model_loader: - kv 16: tokenizer.ggml.bos_token_id u32 = 50256 | |
llama_model_loader: - kv 17: tokenizer.ggml.eos_token_id u32 = 50295 | |
llama_model_loader: - kv 18: tokenizer.ggml.unknown_token_id u32 = 50256 | |
llama_model_loader: - kv 19: tokenizer.ggml.padding_token_id u32 = 50256 | |
llama_model_loader: - kv 20: general.quantization_version u32 = 2 | |
llama_model_loader: - type f32: 195 tensors | |
llama_model_loader: - type q4_K: 81 tensors | |
llama_model_loader: - type q5_K: 32 tensors | |
llama_model_loader: - type q6_K: 17 tensors | |
llm_load_vocab: mismatch in special tokens definition ( 910/51200 vs 944/51200 ). | |
llm_load_print_meta: format = GGUF V3 (latest) | |
llm_load_print_meta: arch = phi2 | |
llm_load_print_meta: vocab type = BPE | |
llm_load_print_meta: n_vocab = 51200 | |
llm_load_print_meta: n_merges = 50000 | |
llm_load_print_meta: n_ctx_train = 2048 | |
llm_load_print_meta: n_embd = 2560 | |
llm_load_print_meta: n_head = 32 | |
llm_load_print_meta: n_head_kv = 32 | |
llm_load_print_meta: n_layer = 32 | |
llm_load_print_meta: n_rot = 32 | |
llm_load_print_meta: n_embd_head_k = 80 | |
llm_load_print_meta: n_embd_head_v = 80 | |
llm_load_print_meta: n_gqa = 1 | |
llm_load_print_meta: n_embd_k_gqa = 2560 | |
llm_load_print_meta: n_embd_v_gqa = 2560 | |
llm_load_print_meta: f_norm_eps = 1.0e-05 | |
llm_load_print_meta: f_norm_rms_eps = 0.0e+00 | |
llm_load_print_meta: f_clamp_kqv = 0.0e+00 | |
llm_load_print_meta: f_max_alibi_bias = 0.0e+00 | |
llm_load_print_meta: n_ff = 10240 | |
llm_load_print_meta: n_expert = 0 | |
llm_load_print_meta: n_expert_used = 0 | |
llm_load_print_meta: rope scaling = linear | |
llm_load_print_meta: freq_base_train = 10000.0 | |
llm_load_print_meta: freq_scale_train = 1 | |
llm_load_print_meta: n_yarn_orig_ctx = 2048 | |
llm_load_print_meta: rope_finetuned = unknown | |
llm_load_print_meta: model type = 3B | |
llm_load_print_meta: model ftype = Q4_K - Medium | |
llm_load_print_meta: model params = 2.78 B | |
llm_load_print_meta: model size = 1.66 GiB (5.14 BPW) | |
llm_load_print_meta: general.name = Phi2 | |
llm_load_print_meta: BOS token = 50256 '<|endoftext|>' | |
llm_load_print_meta: EOS token = 50295 '<|im_end|>' | |
llm_load_print_meta: UNK token = 50256 '<|endoftext|>' | |
llm_load_print_meta: PAD token = 50256 '<|endoftext|>' | |
llm_load_print_meta: LF token = 128 'Ä' | |
llm_load_tensors: ggml ctx size = 0.12 MiB | |
llm_load_tensors: offloading 0 repeating layers to GPU | |
llm_load_tensors: offloaded 0/33 layers to GPU | |
llm_load_tensors: CPU buffer size = 1704.63 MiB | |
.................................................................................. | |
llama_new_context_with_model: n_ctx = 512 | |
llama_new_context_with_model: freq_base = 10000.0 | |
llama_new_context_with_model: freq_scale = 1 | |
llama_kv_cache_init: Vulkan_Host KV buffer size = 160.00 MiB | |
llama_new_context_with_model: KV self size = 160.00 MiB, K (f16): 80.00 MiB, V (f16): 80.00 MiB | |
llama_new_context_with_model: Vulkan_Host input buffer size = 6.01 MiB | |
llama_new_context_with_model: Vulkan_Host compute buffer size = 115.50 MiB | |
llama_new_context_with_model: graph splits (measure): 1 | |
TEST TRANSFER 32000 KB to_gpu 8.06ms (3877.17 MB/s) from_gpu 36.01ms (867.814 MB/s) avg_err=0 | |
TEST TRANSFER 32000 KB to_gpu 20.997ms (1488.31 MB/s) from_gpu 18.752ms (1666.49 MB/s) avg_err=0 | |
TEST DEQUANT q4_0 time=21.397ms avg_err=0.0293562 | |
TEST DEQUANT q4_1 time=29.373ms avg_err=0.0146816 | |
TEST DEQUANT q5_0 time=23.029ms avg_err=0.0146827 | |
TEST DEQUANT q5_1 time=21.936ms avg_err=nan | |
TEST DEQUANT q8_0 time=28.253ms avg_err=0.00185731 | |
TEST DEQUANT q2_K time=12ms avg_err=0.0656395 | |
TEST DEQUANT q3_K time=17.001ms avg_err=0.0533495 | |
TEST DEQUANT q4_K time=16.921ms avg_err=0.0142783 | |
TEST DEQUANT q5_K time=22.388ms avg_err=0.00707844 | |
TEST DEQUANT q6_K time=16.976ms avg_err=0.00638116 | |
TEST F16_F32_S m=8 n=8 k=8 batch=2 split_k=1 matmul 2.859ms avg_err=3.5885e-08 | |
TEST F16_F32_M m=8 n=8 k=8 batch=2 split_k=1 matmul 2.564ms avg_err=3.30328e-08 | |
TEST F16_F32_L m=8 n=8 k=8 batch=2 split_k=1 matmul 5.738ms avg_err=3.87663e-08 | |
TEST F16_F32_S m=8 n=8 k=8 batch=2 split_k=4 matmul 2.974ms avg_err=4.12692e-08 | |
TEST F16_F32_M m=8 n=8 k=8 batch=2 split_k=4 matmul 2.442ms avg_err=2.75904e-08 | |
TEST F16_F32_L m=8 n=8 k=8 batch=2 split_k=4 matmul 5.539ms avg_err=2.49129e-08 | |
TEST F16_F32_ALIGNED_S m=100 n=46 k=576 batch=2 split_k=1 matmul 86.319ms avg_err=0.00903933 | |
TEST F16_F32_ALIGNED_M m=100 n=46 k=576 batch=2 split_k=1 matmul 17.755ms avg_err=0.00895996 | |
TEST F16_F32_L m=100 n=46 k=576 batch=2 split_k=1 matmul 59.948ms avg_err=0.00887973 | |
TEST F16_F32_ALIGNED_S m=100 n=46 k=576 batch=2 split_k=4 matmul 13.571ms avg_err=0.00899095 | |
TEST F16_F32_ALIGNED_M m=100 n=46 k=576 batch=2 split_k=4 matmul 5.972ms avg_err=0.00901088 | |
TEST F16_F32_L m=100 n=46 k=576 batch=2 split_k=4 matmul 20.176ms avg_err=0.00898059 | |
TEST F16_F32_ALIGNED_S m=623 n=111 k=128 batch=2 split_k=1 matmul 16.266ms avg_err=0.00273154 | |
TEST F16_F32_ALIGNED_M m=623 n=111 k=128 batch=2 split_k=1 matmul 5.549ms avg_err=0.00270985 | |
TEST F16_F32_ALIGNED_L m=623 n=111 k=128 batch=2 split_k=1 matmul 17.037ms avg_err=0.00272796 | |
TEST F16_F32_ALIGNED_S m=623 n=111 k=128 batch=2 split_k=4 matmul 10.029ms avg_err=0.00273871 | |
TEST F16_F32_ALIGNED_M m=623 n=111 k=128 batch=2 split_k=4 matmul 12.533ms avg_err=0.00274415 | |
TEST F16_F32_ALIGNED_L m=623 n=111 k=128 batch=2 split_k=4 matmul 11.393ms avg_err=0.0027185 | |
TEST F16_F32_S m=100 n=46 k=558 batch=2 split_k=1 matmul 61.629ms avg_err=0.00864928 | |
TEST F16_F32_M m=100 n=46 k=558 batch=2 split_k=1 matmul 27.739ms avg_err=0.00865995 | |
TEST F16_F32_L m=100 n=46 k=558 batch=2 split_k=1 matmul 57.084ms avg_err=0.00878766 | |
TEST F16_F32_S m=100 n=46 k=558 batch=2 split_k=4 matmul 12.523ms avg_err=0.00863343 | |
TEST F16_F32_M m=100 n=46 k=558 batch=2 split_k=4 matmul 7.564ms avg_err=0.00865965 | |
TEST F16_F32_L m=100 n=46 k=558 batch=2 split_k=4 matmul 16.165ms avg_err=0.00857469 | |
TEST F16_F32_ALIGNED_S m=512 n=1 k=256 batch=2 split_k=1 matmul 23.553ms avg_err=0.00470129 | |
TEST F16_F32_ALIGNED_M m=512 n=1 k=256 batch=2 split_k=1 matmul 9.019ms avg_err=0.00476561 | |
TEST F16_F32_ALIGNED_L m=512 n=1 k=256 batch=2 split_k=1 matmul 31.247ms avg_err=0.00465407 | |
TEST F16_F32_ALIGNED_S m=512 n=1 k=256 batch=2 split_k=4 matmul 5.74ms avg_err=0.00457673 | |
TEST F16_F32_ALIGNED_M m=512 n=1 k=256 batch=2 split_k=4 matmul 4.838ms avg_err=0.00430648 | |
TEST F16_F32_ALIGNED_L m=512 n=1 k=256 batch=2 split_k=4 matmul 14.278ms avg_err=0.00468248 | |
TEST F16_F32_S m=128 n=110 k=622 batch=2 split_k=1 matmul 59.717ms avg_err=0.0094636 | |
TEST F16_F32_M m=128 n=110 k=622 batch=2 split_k=1 matmul 29.549ms avg_err=0.00956401 | |
TEST F16_F32_L m=128 n=110 k=622 batch=2 split_k=1 matmul 70.088ms avg_err=0.00949898 | |
TEST F16_F32_S m=128 n=110 k=622 batch=2 split_k=4 matmul 14.395ms avg_err=0.0094583 | |
TEST F16_F32_M m=128 n=110 k=622 batch=2 split_k=4 matmul 8.736ms avg_err=0.00950278 | |
TEST F16_F32_L m=128 n=110 k=622 batch=2 split_k=4 matmul 21.581ms avg_err=0.00953997 | |
TEST F16_F32_S m=511 n=511 k=127 batch=2 split_k=1 matmul 31.273ms avg_err=0.00223408 | |
TEST F16_F32_M m=511 n=511 k=127 batch=2 split_k=1 matmul 42.991ms avg_err=0.00222613 | |
TEST F16_F32_L m=511 n=511 k=127 batch=2 split_k=1 matmul 30.671ms avg_err=0.00223636 | |
TEST F16_F32_S m=511 n=511 k=127 batch=2 split_k=4 matmul 28.468ms avg_err=0.00222728 | |
TEST F16_F32_M m=511 n=511 k=127 batch=2 split_k=4 matmul 46.069ms avg_err=0.00222255 | |
TEST F16_F32_L m=511 n=511 k=127 batch=2 split_k=4 matmul 32.965ms avg_err=0.00223303 | |
TEST F16_F32_S m=511 n=511 k=7 batch=2 split_k=1 matmul 4.296ms avg_err=2.91652e-08 | |
TEST F16_F32_M m=511 n=511 k=7 batch=2 split_k=1 matmul 5.679ms avg_err=2.96846e-08 | |
TEST F16_F32_L m=511 n=511 k=7 batch=2 split_k=1 matmul 4.486ms avg_err=2.85018e-08 | |
TEST F16_F32_S m=511 n=511 k=7 batch=2 split_k=4 matmul 14.936ms avg_err=2.54198e-08 | |
TEST F16_F32_M m=511 n=511 k=7 batch=2 split_k=4 matmul 30.942ms avg_err=2.55662e-08 | |
TEST F16_F32_L m=511 n=511 k=7 batch=2 split_k=4 matmul 20.046ms avg_err=2.5584e-08 | |
TEST F16_F32_S m=511 n=511 k=17 batch=2 split_k=1 matmul 9.332ms avg_err=1.07428e-07 | |
TEST F16_F32_M m=511 n=511 k=17 batch=2 split_k=1 matmul 14.032ms avg_err=1.07329e-07 | |
TEST F16_F32_L m=511 n=511 k=17 batch=2 split_k=1 matmul 7.84ms avg_err=1.08659e-07 | |
TEST F16_F32_S m=511 n=511 k=17 batch=2 split_k=4 matmul 15.905ms avg_err=8.60783e-08 | |
TEST F16_F32_M m=511 n=511 k=17 batch=2 split_k=4 matmul 23.655ms avg_err=8.60826e-08 | |
TEST F16_F32_L m=511 n=511 k=17 batch=2 split_k=4 matmul 14.892ms avg_err=8.52428e-08 | |
TEST F16_F32_ALIGNED_S m=49 n=49 k=128 batch=2 split_k=1 matmul 9.133ms avg_err=0.00271611 | |
TEST F16_F32_ALIGNED_M m=49 n=49 k=128 batch=2 split_k=1 matmul 5.94ms avg_err=0.00263445 | |
TEST F16_F32_ALIGNED_L m=49 n=49 k=128 batch=2 split_k=1 matmul 19.579ms avg_err=0.00275594 | |
TEST F16_F32_ALIGNED_S m=49 n=49 k=128 batch=2 split_k=4 matmul 4.908ms avg_err=0.00269092 | |
TEST F16_F32_ALIGNED_M m=49 n=49 k=128 batch=2 split_k=4 matmul 1.916ms avg_err=0.00272351 | |
TEST F16_F32_ALIGNED_L m=49 n=49 k=128 batch=2 split_k=4 matmul 5.861ms avg_err=0.00275323 | |
TEST F16_F32_S m=128 n=49 k=49 batch=2 split_k=1 matmul 9.584ms avg_err=0.00113111 | |
TEST F16_F32_M m=128 n=49 k=49 batch=2 split_k=1 matmul 4.823ms avg_err=0.00111428 | |
TEST F16_F32_L m=128 n=49 k=49 batch=2 split_k=1 matmul 10.616ms avg_err=0.00114444 | |
TEST F16_F32_S m=128 n=49 k=49 batch=2 split_k=4 matmul 2.829ms avg_err=0.00109426 | |
TEST F16_F32_M m=128 n=49 k=49 batch=2 split_k=4 matmul 1.845ms avg_err=0.0011375 | |
TEST F16_F32_L m=128 n=49 k=49 batch=2 split_k=4 matmul 3.229ms avg_err=0.00112114 | |
TEST F16_F32_ALIGNED_S m=4096 n=49 k=4096 batch=2 split_k=1 matmul 756.405ms avg_err=0.0572566 | |
TEST F16_F32_ALIGNED_M m=4096 n=49 k=4096 batch=2 split_k=1 matmul 541.39ms avg_err=0.0572785 | |
TEST F16_F32_ALIGNED_L m=4096 n=49 k=4096 batch=2 split_k=1 matmul 1341.27ms avg_err=0.0572805 | |
TEST F16_F32_ALIGNED_S m=4096 n=49 k=4096 batch=2 split_k=4 matmul 694.013ms avg_err=0.0573234 | |
TEST F16_F32_ALIGNED_M m=4096 n=49 k=4096 batch=2 split_k=4 matmul 555.439ms avg_err=0.057422 | |
TEST F16_F32_ALIGNED_L m=4096 n=49 k=4096 batch=2 split_k=4 matmul 1397.91ms avg_err=0.0573238 | |
TEST F16_F32_ALIGNED_S m=11008 n=49 k=4096 batch=2 split_k=1 matmul 1822.45ms avg_err=0.0573789 | |
TEST F16_F32_ALIGNED_M m=11008 n=49 k=4096 batch=2 split_k=1 matmul 1592.09ms avg_err=0.057395 | |
TEST F16_F32_ALIGNED_L m=11008 n=49 k=4096 batch=2 split_k=1 matmul 3678.7ms avg_err=0.0572639 | |
TEST F16_F32_ALIGNED_S m=11008 n=49 k=4096 batch=2 split_k=4 matmul 1877.84ms avg_err=0.0573863 | |
TEST F16_F32_ALIGNED_M m=11008 n=49 k=4096 batch=2 split_k=4 matmul 1456.92ms avg_err=0.0574161 | |
TEST F16_F32_ALIGNED_L m=11008 n=49 k=4096 batch=2 split_k=4 matmul 3747.22ms avg_err=0.0572428 | |
TEST F16_F32_ALIGNED_S m=4096 n=49 k=11008 batch=2 split_k=1 matmul 1979.79ms avg_err=0.152192 | |
m = 0 n = 0 b = 0 | |
Actual result: | |
0 1 2 3 4 5 6 7 8 9 | |
0: -17.87 -50.06 -11.64 -52.56 31.11 -10.86 -13.44 -34.79 15.54 7.14 | |
1: -32.12 -6.01 -29.48 -17.35 -11.25 2.02 -15.52 54.26 35.29 -53.10 | |
2: 23.63 -13.42 18.13 49.77 -14.42 29.23 -18.03 2.04 -4.12 8.22 | |
3: 55.25 81.20 -2.17 39.63 25.50 -9.21 48.94 1.62 -25.20 11.26 | |
4: 48.38 -44.08 -11.95 18.48 11.89 12.64 41.81 -79.25 -8.71 48.89 | |
5: 21.71 -38.49 -16.96 17.53 -8.42 -50.59 17.04 20.67 41.56 76.08 | |
6: 41.13 18.94 20.54 34.14 -3.34 22.95 55.09 22.23 38.34 51.14 | |
7: 13.83 -42.59 -29.14 -31.61 -16.67 -3.19 40.18 2.50 23.52 16.57 | |
8: 51.12 -8.72 41.19 18.01 -16.29 70.44 14.46 25.14 -51.99 -3.95 | |
9: -26.11 -12.26 -17.84 -15.50 -56.44 -15.93 7.32 -23.10 24.40 29.93 | |
Expected result: | |
0 1 2 3 4 5 6 7 8 9 | |
0: -17.80 -49.98 -11.67 -52.55 31.14 -10.86 -13.49 -34.77 15.53 7.22 | |
1: -32.29 -5.92 -29.43 -17.33 -11.16 2.00 -15.88 54.27 35.38 -53.06 | |
2: 23.74 -13.34 17.98 49.71 -14.31 29.21 -17.84 1.74 -4.24 8.25 | |
3: 55.22 81.10 -2.19 39.67 25.49 -9.16 48.90 1.49 -25.20 11.34 | |
4: 48.54 -43.98 -11.92 18.48 11.84 12.74 42.04 -79.39 -8.75 49.06 | |
5: 21.73 -38.61 -17.00 17.67 -8.37 -50.54 17.08 20.55 41.47 76.16 | |
6: 41.27 18.85 20.47 34.10 -3.24 22.87 55.06 22.14 38.24 50.96 | |
7: 13.84 -42.67 -29.16 -31.53 -16.67 -3.27 40.18 2.40 23.40 16.64 | |
8: 51.19 -8.74 41.23 17.99 -16.27 70.27 14.61 25.09 -51.99 -3.93 | |
9: -26.02 -12.21 -17.64 -15.54 -56.61 -15.92 7.31 -23.03 24.67 29.93 | |
TEST F16_F32_ALIGNED_M m=4096 n=49 k=11008 batch=2 split_k=1 matmul 1439.12ms avg_err=0.151754 | |
m = 1 n = 0 b = 0 | |
Actual result: | |
0 1 2 3 4 5 6 7 8 9 | |
0: -74.45 -25.07 54.53 -16.59 -14.70 -16.67 36.47 -2.26 27.39 -32.49 | |
1: -55.27 1.92 2.54 5.26 19.99 -30.06 -8.50 -57.23 -4.16 35.76 | |
2: -28.25 76.18 -53.84 19.85 -3.38 14.56 24.12 -21.09 31.17 -21.44 | |
3: 4.86 52.00 -22.12 -35.08 -15.20 49.38 -25.48 -12.53 39.74 11.47 | |
4: -34.18 23.09 -7.30 5.01 45.95 30.02 10.92 10.43 63.84 17.27 | |
5: -2.90 -13.85 8.22 -8.83 -56.53 -12.54 -37.01 -12.72 5.87 -27.44 | |
6: -3.03 -1.73 -0.06 -4.27 -33.03 -34.29 35.04 8.65 -9.15 -41.63 | |
7: 13.18 -71.36 41.10 -29.28 -13.33 0.23 -10.22 83.58 -11.66 -2.75 | |
8: 31.76 -48.48 -58.95 -46.91 3.32 -15.96 -35.25 63.11 38.69 24.80 | |
9: -31.08 36.35 4.79 -3.71 15.88 11.95 -15.13 -9.28 -72.99 -2.54 | |
Expected result: | |
0 1 2 3 4 5 6 7 8 9 | |
0: -74.45 -24.96 54.57 -16.53 -14.64 -16.70 36.46 -2.29 27.43 -32.45 | |
1: -55.22 1.98 2.55 5.26 20.16 -30.08 -8.57 -57.25 -4.13 35.75 | |
2: -28.24 76.31 -53.92 19.77 -3.45 14.53 24.16 -21.18 31.20 -21.39 | |
3: 4.81 52.09 -22.10 -35.13 -15.23 49.38 -25.47 -12.63 39.62 11.69 | |
4: -34.06 22.98 -7.22 4.98 45.81 29.98 10.66 10.32 63.86 17.23 | |
5: -3.00 -13.70 8.19 -8.77 -56.48 -12.62 -37.00 -12.69 5.92 -27.35 | |
6: -2.95 -1.65 0.07 -4.26 -33.20 -34.23 35.10 8.46 -9.02 -41.68 | |
7: 13.19 -71.34 41.07 -29.36 -13.35 0.16 -10.34 83.66 -11.74 -2.84 | |
8: 31.72 -48.59 -58.94 -46.90 3.29 -15.96 -35.15 63.12 38.75 24.81 | |
9: -31.02 36.23 4.89 -3.55 15.86 11.86 -15.26 -9.10 -72.96 -2.41 | |
TEST F16_F32_ALIGNED_L m=4096 n=49 k=11008 batch=2 split_k=1 matmul 3454.31ms avg_err=0.151801 | |
m = 1 n = 0 b = 0 | |
Actual result: | |
0 1 2 3 4 5 6 7 8 9 | |
0: 18.17 -2.25 -67.63 -33.97 -10.74 43.64 -4.12 12.42 51.52 3.73 | |
1: -4.78 10.80 32.87 18.84 18.91 4.22 25.46 -1.14 49.70 -43.36 | |
2: -46.69 -25.41 -7.07 -5.13 25.78 -28.94 -1.67 39.56 -46.60 2.28 | |
3: 36.48 27.70 13.99 -8.60 -65.40 -27.46 -16.12 -7.55 -25.10 15.22 | |
4: 20.87 -28.30 2.24 -3.36 -39.19 45.26 61.22 6.35 22.07 52.99 | |
5: 4.59 -26.73 -22.62 -60.58 -55.66 30.26 -79.79 -65.88 -27.61 -48.30 | |
6: 20.34 30.11 6.16 -48.16 16.49 26.33 -11.49 -40.90 -18.52 -45.31 | |
7: -44.88 -23.81 -65.73 -22.00 56.39 -39.39 -55.03 -56.62 -18.21 -19.29 | |
8: -26.94 13.33 -33.63 9.77 22.20 -4.49 -14.42 41.07 -23.99 32.86 | |
9: 18.77 50.34 1.46 -0.15 23.63 -31.53 45.94 -47.68 3.09 -11.42 | |
Expected result: | |
0 1 2 3 4 5 6 7 8 9 | |
0: 18.19 -2.11 -67.61 -33.96 -10.82 43.70 -4.10 12.35 51.33 3.77 | |
1: -4.94 10.78 32.93 18.91 19.04 4.19 25.55 -0.99 49.83 -43.36 | |
2: -46.66 -25.47 -7.17 -5.21 25.92 -28.97 -1.61 39.70 -46.76 2.28 | |
3: 36.44 27.68 14.00 -8.70 -65.46 -27.44 -16.22 -7.57 -25.19 15.33 | |
4: 20.81 -28.27 2.27 -3.35 -39.18 45.26 61.12 6.33 22.10 52.96 | |
5: 4.61 -26.74 -22.50 -60.72 -55.70 30.51 -79.62 -65.88 -27.66 -48.27 | |
6: 20.29 30.13 6.10 -48.29 16.57 26.22 -11.45 -40.89 -18.56 -45.27 | |
7: -44.84 -24.01 -65.82 -22.11 56.58 -39.31 -55.06 -56.69 -18.11 -19.13 | |
8: -27.09 13.39 -33.57 9.84 22.20 -4.47 -14.50 41.10 -23.99 32.84 | |
9: 18.91 50.44 1.44 -0.05 23.75 -31.49 46.10 -47.68 3.23 -11.28 | |
TEST F16_F32_ALIGNED_S m=4096 n=49 k=11008 batch=2 split_k=4 matmul 1879.89ms avg_err=0.151988 | |
m = 1 n = 0 b = 0 | |
Actual result: | |
0 1 2 3 4 5 6 7 8 9 | |
0: 0.84 59.72 33.15 -25.85 18.08 -1.96 -46.14 28.61 23.56 26.15 | |
1: 24.58 18.37 -0.70 81.09 -36.63 19.65 -24.91 -55.25 -6.05 -23.64 | |
2: -24.40 -22.22 -32.73 39.08 -4.06 -13.45 -57.33 17.28 19.27 -32.60 | |
3: -63.76 37.80 -28.45 0.37 -9.87 -1.44 -1.01 -30.65 -9.08 -24.28 | |
4: 111.39 65.71 1.45 13.05 -5.12 19.83 4.05 -48.46 7.81 -10.44 | |
5: 49.53 6.06 3.98 -16.93 -4.00 24.22 61.05 -40.55 -9.34 46.69 | |
6: -18.85 -22.00 -20.53 23.31 25.49 71.57 53.82 -39.00 -36.72 -31.70 | |
7: -107.16 9.29 0.34 39.88 38.98 52.96 -65.82 -36.47 -6.06 69.59 | |
8: 7.79 -30.47 -29.58 -38.56 -49.71 -5.86 12.88 15.89 -71.26 20.62 | |
9: -1.65 53.81 91.36 -20.59 23.76 15.60 -21.32 -55.99 24.00 -4.60 | |
Expected result: | |
0 1 2 3 4 5 6 7 8 9 | |
0: 0.89 59.88 32.99 -25.84 18.16 -2.03 -46.19 28.39 23.60 26.23 | |
1: 24.52 18.48 -0.76 80.97 -36.63 19.56 -24.75 -55.21 -6.06 -23.59 | |
2: -24.41 -22.02 -32.81 39.14 -3.94 -13.38 -57.32 17.46 19.49 -32.42 | |
3: -63.90 37.72 -28.31 0.32 -9.87 -1.63 -1.07 -30.43 -9.18 -24.31 | |
4: 111.34 65.70 1.49 12.99 -5.18 19.92 3.95 -48.48 7.75 -10.47 | |
5: 49.36 5.96 3.98 -17.16 -3.84 24.18 60.89 -40.60 -9.38 46.68 | |
6: -18.77 -21.96 -20.46 23.11 25.63 71.66 53.86 -38.96 -36.74 -31.64 | |
7: -107.16 9.32 0.27 39.91 38.89 52.83 -65.87 -36.37 -5.97 69.71 | |
8: 7.83 -30.62 -29.65 -38.71 -49.67 -5.78 13.07 15.98 -71.33 20.70 | |
9: -1.69 53.76 91.40 -20.50 23.69 15.73 -21.37 -56.02 24.12 -4.54 | |
d_buf0: | |
0 1 2 3 4 5 6 7 8 9 | |
0: 1.95 13.06 25.68 15.93 1.43 -4.23 -17.75 5.81 16.03 24.96 | |
1: 20.14 1.68 -2.93 33.46 5.09 -2.60 0.94 -4.68 -20.98 -10.71 | |
2: -7.26 4.87 -18.44 14.40 9.26 10.67 -16.97 26.20 -3.91 -14.98 | |
3: -4.56 -11.85 1.26 -4.24 -2.16 22.28 5.81 -30.90 17.51 18.24 | |
4: 29.77 -4.91 -6.01 2.19 0.77 2.37 -6.63 22.31 9.10 -49.17 | |
5: 28.37 6.01 1.51 2.18 -8.09 3.02 19.38 1.94 -17.84 34.26 | |
6: -11.76 -15.42 -6.01 6.78 23.50 20.43 22.21 18.98 18.42 -7.77 | |
7: -14.72 -7.46 3.23 -7.59 10.66 6.71 9.21 -19.05 -15.64 22.16 | |
8: 46.30 8.70 -7.31 -2.52 -9.00 -16.95 -5.29 12.62 -33.91 7.18 | |
9: 6.24 23.62 32.65 -4.72 18.27 3.55 -5.10 -17.02 -19.38 -8.05 | |
d_buf1: | |
0 1 2 3 4 5 6 7 8 9 | |
0: 2.14 15.77 24.94 -7.75 -7.54 -0.92 9.27 10.17 5.05 -7.51 | |
1: 2.04 -7.38 -20.01 13.16 1.89 -9.35 5.69 -38.49 14.09 -21.29 | |
2: 21.52 -1.13 -15.08 11.79 -22.10 -37.29 -1.38 -19.26 25.98 30.61 | |
3: -30.70 24.91 -12.34 -1.97 -0.58 -23.87 7.15 13.17 -18.74 -6.73 | |
4: 33.14 26.68 11.17 -25.07 -12.53 14.44 21.74 -18.94 -5.33 -0.96 | |
5: 12.86 14.31 -2.77 19.11 25.53 -2.47 26.71 -6.27 -11.69 -10.23 | |
6: 4.03 -5.27 17.53 8.31 -6.24 16.84 14.20 -34.33 -41.03 -4.95 | |
7: -42.00 22.24 18.62 12.75 0.62 40.27 -13.45 -13.95 -5.46 14.92 | |
8: -12.65 6.62 -0.92 -5.18 -28.66 9.87 -15.90 14.71 -13.12 8.75 | |
9: 12.68 -13.84 30.32 -7.01 -8.08 16.38 -1.44 -5.07 40.97 11.82 | |
d_buf2: | |
0 1 2 3 4 5 6 7 8 9 | |
0: 5.98 4.83 -23.75 -19.56 21.95 11.48 -22.64 34.51 12.71 10.40 | |
1: 1.41 25.72 5.71 16.64 -1.22 9.35 -3.75 -26.43 -7.64 4.90 | |
2: -7.94 -27.00 3.40 10.16 -6.95 -10.29 -13.17 -6.20 -4.91 -29.95 | |
3: -9.89 11.14 -4.50 16.62 -9.64 -11.93 -8.06 -4.78 4.72 -23.25 | |
4: 10.76 24.09 8.60 7.07 2.21 0.94 -27.31 -17.34 -23.49 40.17 | |
5: 8.53 -17.51 14.35 -12.65 4.97 8.78 -13.76 -15.07 9.16 16.25 | |
6: -17.74 -0.44 -6.76 10.84 -9.71 19.47 4.77 -8.68 -12.20 5.50 | |
7: -26.78 -9.94 -2.61 10.36 7.16 -0.10 -40.58 -12.74 13.88 41.42 | |
8: 2.27 -41.18 19.69 -11.18 -0.78 -0.55 15.56 6.29 -8.59 -8.59 | |
9: -32.69 14.41 29.96 -4.42 6.78 -14.12 -0.12 -10.74 -10.55 -0.81 | |
d_buf3: | |
0 1 2 3 4 5 6 7 8 9 | |
0: -9.23 26.07 6.28 -14.48 2.24 -8.28 -15.02 -21.87 -10.24 -1.69 | |
1: 0.99 -1.65 16.53 17.84 -42.39 22.25 -27.79 14.36 8.49 3.47 | |
2: -30.71 1.04 -2.62 2.72 15.73 23.46 -25.81 16.54 2.11 -18.29 | |
3: -18.61 13.61 -12.87 -10.05 2.52 12.09 -5.91 -8.14 -12.57 -12.54 | |
4: 37.71 19.86 -12.31 28.86 4.44 2.08 16.25 -34.49 27.53 -0.48 | |
5: -0.23 3.26 -9.10 -25.58 -26.41 14.89 28.73 -21.15 11.02 6.41 | |
6: 6.62 -0.88 -25.29 -2.62 17.94 14.84 12.65 -14.97 -1.92 -24.48 | |
7: -23.66 4.44 -18.89 24.35 20.53 6.07 -21.01 9.27 1.16 -8.90 | |
8: -28.14 -4.61 -41.04 -19.69 -11.26 1.76 18.51 -17.73 -15.65 13.28 | |
9: 12.12 29.61 -1.57 -4.44 6.79 9.78 -14.66 -23.16 12.95 -7.56 | |
TEST F16_F32_ALIGNED_M m=4096 n=49 k=11008 batch=2 split_k=4 matmul 1615.44ms avg_err=0.152401 | |
m = 1 n = 0 b = 0 | |
Actual result: | |
0 1 2 3 4 5 6 7 8 9 | |
0: 14.16 25.15 17.97 60.53 -12.57 -19.12 10.56 -11.38 -47.53 -21.64 | |
1: -46.32 -23.00 -25.88 -40.11 -34.16 -26.81 15.30 44.35 -40.98 9.35 | |
2: -21.85 11.17 11.81 2.29 48.02 -21.45 -16.87 -3.18 -44.34 32.72 | |
3: -12.60 -7.02 -24.98 -61.65 66.78 16.39 4.15 23.98 20.56 21.16 | |
4: -11.18 2.08 34.95 -18.34 28.17 33.02 -14.95 3.33 88.05 -23.43 | |
5: 15.37 66.59 -25.00 -13.04 0.77 18.66 -9.04 28.31 38.55 -16.93 | |
6: -75.69 -66.88 14.44 19.18 18.38 21.85 91.97 23.13 5.26 -31.82 | |
7: -16.17 -37.56 0.55 -48.07 -1.17 14.63 -16.84 -1.78 -16.22 35.86 | |
8: 8.34 -47.77 6.87 -10.17 -8.97 -11.24 12.37 -62.45 1.93 -0.22 | |
9: -25.11 -36.07 -32.79 -38.95 7.19 35.71 -7.20 -16.81 26.67 42.38 | |
Expected result: | |
0 1 2 3 4 5 6 7 8 9 | |
0: 14.16 25.14 17.85 60.46 -12.57 -19.28 10.70 -11.38 -47.52 -21.50 | |
1: -46.38 -23.02 -25.85 -39.95 -34.33 -26.84 15.26 44.44 -41.11 9.50 | |
2: -21.99 11.37 12.05 2.39 48.13 -21.51 -16.80 -3.20 -44.23 32.74 | |
3: -12.40 -6.89 -24.98 -61.53 66.83 16.29 4.27 23.99 20.68 21.13 | |
4: -11.30 2.10 34.98 -18.13 28.14 33.23 -15.02 3.41 87.92 -23.37 | |
5: 15.51 66.59 -25.01 -12.95 0.75 18.49 -9.12 28.27 38.48 -17.02 | |
6: -75.82 -66.88 14.60 19.13 18.33 21.87 91.91 23.12 5.13 -31.87 | |
7: -16.16 -37.79 0.66 -48.00 -1.04 14.48 -16.79 -1.80 -16.26 35.61 | |
8: 8.34 -47.86 6.85 -10.16 -8.92 -11.28 12.36 -62.41 1.82 -0.20 | |
9: -25.18 -35.97 -32.82 -38.75 7.12 35.61 -7.29 -16.79 26.78 42.36 | |
d_buf0: | |
0 1 2 3 4 5 6 7 8 9 | |
0: 23.01 6.90 5.22 29.79 -13.19 -11.27 -14.43 -15.63 -23.62 -30.90 | |
1: -33.71 12.37 -38.73 3.58 8.84 -8.43 15.33 -6.92 -21.97 -15.70 | |
2: 7.36 -17.76 4.45 10.12 15.95 -5.29 5.35 -5.40 -23.93 -16.01 | |
3: -6.87 -17.01 -1.32 -10.17 25.25 -36.90 20.48 17.15 -18.99 7.55 | |
4: -11.20 -15.85 21.41 -20.97 4.56 13.09 -20.59 6.86 46.13 -17.71 | |
5: 5.67 14.79 10.86 -2.16 -22.38 15.02 -4.15 15.16 24.32 -25.19 | |
6: -16.07 4.39 5.20 13.69 -0.97 0.28 2.79 -5.23 27.31 2.12 | |
7: 1.83 -1.12 5.54 -25.56 2.54 27.24 4.49 3.75 7.52 2.84 | |
8: -0.32 17.49 25.71 -35.90 19.31 18.01 -12.41 -21.45 -1.41 -23.81 | |
9: -32.56 -30.25 -23.75 -7.06 3.59 5.45 -12.61 8.09 -0.45 5.51 | |
d_buf1: | |
0 1 2 3 4 5 6 7 8 9 | |
0: 18.65 1.60 7.18 43.56 26.29 -20.86 9.40 26.58 -32.58 19.31 | |
1: -11.21 -4.99 1.50 -13.84 -5.14 10.04 4.07 12.69 5.46 7.16 | |
2: -17.63 7.39 -16.13 -23.81 -5.33 -29.82 -9.97 -24.04 -3.43 25.69 | |
3: 11.75 19.39 -20.76 -13.58 22.62 16.16 15.39 -3.21 23.74 11.26 | |
4: 16.76 18.31 -0.26 -0.64 -9.43 -1.54 11.24 -30.66 16.02 -13.36 | |
5: 4.60 17.18 -6.68 1.56 16.06 -21.65 -8.87 -4.46 12.96 3.38 | |
6: -26.38 -26.28 -1.41 -10.93 20.90 16.62 27.20 23.54 -29.32 -5.40 | |
7: -13.86 2.86 25.43 -2.10 -0.07 -30.27 -11.93 8.09 16.35 20.80 | |
8: -16.78 -28.77 -2.96 19.96 10.98 -3.64 18.56 -3.92 -12.43 6.14 | |
9: 11.22 6.24 -11.40 -7.29 -3.52 -19.77 27.53 -6.03 4.85 8.31 | |
d_buf2: | |
0 1 2 3 4 5 6 7 8 9 | |
0: -38.12 32.82 2.25 -0.64 -9.00 -27.51 16.22 -0.84 -14.17 15.00 | |
1: 4.70 -21.39 16.62 -2.74 -36.02 -6.79 8.53 32.97 -5.22 14.24 | |
2: -4.49 6.38 -0.59 11.40 37.17 20.52 5.92 -3.35 7.89 -2.12 | |
3: -11.92 -13.83 4.14 7.49 -15.20 32.59 -24.64 20.44 9.79 -16.86 | |
4: -11.50 14.68 21.54 -0.98 32.22 27.64 4.15 29.06 28.38 18.30 | |
5: -30.68 5.38 -12.84 -7.06 22.66 13.61 -33.61 1.59 -2.44 -8.68 | |
6: 0.63 -8.65 -31.65 24.00 5.80 3.91 40.92 -0.12 -17.48 -15.99 | |
7: -12.47 -30.37 -34.91 15.40 -7.73 8.14 -8.68 6.81 -25.88 22.06 | |
8: 21.53 -16.97 -6.61 -9.19 -17.91 -2.14 -24.50 -44.10 12.20 4.04 | |
9: -14.25 -12.56 3.92 -16.65 17.10 42.78 18.86 -8.44 9.08 27.34 | |
d_buf3: | |
0 1 2 3 4 5 6 7 8 9 | |
0: 10.63 -16.17 3.31 -12.17 -16.67 40.52 -0.62 -21.49 22.84 -25.05 | |
1: -6.09 -8.99 -5.26 -27.11 -1.85 -21.63 -12.63 5.61 -19.24 3.66 | |
2: -7.09 15.17 24.08 4.58 0.22 -6.87 -18.17 29.62 -24.88 25.16 | |
3: -5.56 4.43 -7.04 -45.38 34.11 4.55 -7.07 -10.40 6.03 19.21 | |
4: -5.23 -15.06 -7.74 4.26 0.82 -6.17 -9.75 -1.94 -2.47 -10.66 | |
5: 35.77 29.25 -16.34 -5.38 -15.57 11.67 37.59 16.01 3.71 13.55 | |
6: -33.87 -36.35 42.31 -7.58 -7.35 1.04 21.07 4.93 24.75 -12.54 | |
7: 8.33 -8.92 4.49 -35.81 4.09 9.52 -0.72 -20.43 -14.20 -9.84 | |
8: 3.91 -19.53 -9.26 14.95 -21.35 -23.47 30.72 7.01 3.57 13.41 | |
9: 10.48 0.50 -1.56 -7.95 -9.99 7.24 -40.98 -10.43 13.19 1.22 | |
TEST F16_F32_ALIGNED_L m=4096 n=49 k=11008 batch=2 split_k=4 matmul 3733.57ms avg_err=0.152055 | |
m = 0 n = 0 b = 0 | |
Actual result: | |
0 1 2 3 4 5 6 7 8 9 | |
0: -24.69 8.03 5.51 -47.71 -1.26 -34.13 3.69 -53.05 15.17 -49.21 | |
1: 60.76 2.57 -10.13 5.55 46.11 -15.11 -9.72 -31.94 34.46 0.14 | |
2: -29.87 -10.36 8.06 -37.54 -9.78 59.61 -23.92 -18.32 16.11 -15.52 | |
3: -51.67 -15.43 19.76 35.96 -12.55 24.27 -26.17 44.77 -18.09 -40.85 | |
4: 21.84 -24.96 17.94 -11.73 -0.52 -13.68 -4.62 -3.68 8.20 9.28 | |
5: 9.50 29.46 -46.96 -39.43 16.75 -5.47 -39.21 -46.67 -22.32 10.79 | |
6: -18.57 -1.50 24.23 56.67 -12.62 35.64 43.25 27.21 0.23 -15.34 | |
7: -3.69 65.55 8.78 -61.79 12.08 1.62 -15.16 56.32 4.81 1.81 | |
8: 36.93 -17.19 -26.45 40.27 -34.80 -10.54 25.25 34.02 28.96 48.88 | |
9: 24.14 44.77 -0.60 -8.83 -33.34 22.59 18.77 47.23 3.83 58.32 | |
Expected result: | |
0 1 2 3 4 5 6 7 8 9 | |
0: -24.82 8.06 5.44 -47.77 -1.31 -33.86 3.72 -52.98 15.23 -49.09 | |
1: 60.73 2.77 -10.05 5.59 46.11 -15.03 -9.52 -31.80 34.43 0.06 | |
2: -29.96 -10.40 8.05 -37.33 -9.81 59.81 -23.92 -18.25 15.96 -15.62 | |
3: -51.72 -15.29 19.62 35.88 -12.76 24.25 -26.11 44.74 -18.09 -40.82 | |
4: 21.87 -25.05 17.81 -11.55 -0.48 -13.66 -4.65 -3.71 8.25 9.37 | |
5: 9.51 29.52 -47.01 -39.72 16.70 -5.45 -39.20 -46.75 -22.24 10.78 | |
6: -18.53 -1.45 24.29 56.59 -12.60 35.60 43.16 27.12 0.55 -15.13 | |
7: -3.63 65.41 8.65 -61.98 12.05 1.66 -15.29 56.31 4.72 1.84 | |
8: 37.08 -17.28 -26.46 40.35 -34.79 -10.50 25.21 34.00 28.94 48.98 | |
9: 24.07 44.81 -0.58 -8.74 -33.33 22.66 18.84 47.19 3.80 58.32 | |
d_buf0: | |
0 1 2 3 4 5 6 7 8 9 | |
0: -10.93 5.29 -2.31 -48.62 0.32 -14.44 30.55 -12.20 -1.86 10.26 | |
1: 10.72 -7.28 8.88 -5.16 1.78 -12.22 -17.52 12.90 18.29 2.35 | |
2: 1.15 -18.17 -1.07 5.64 -37.89 30.01 -22.82 3.18 13.09 26.69 | |
3: -34.25 -24.88 30.10 2.24 6.05 4.92 1.99 33.54 1.58 -18.49 | |
4: -23.93 -4.30 -7.34 5.09 3.85 -3.07 4.92 19.50 -12.38 -3.19 | |
5: -11.64 17.91 2.71 -15.29 23.02 -0.59 -1.02 8.14 -3.01 -4.68 | |
6: 20.06 -24.27 17.63 -5.09 -0.93 1.07 8.99 20.52 -12.91 -14.65 | |
7: 20.82 16.73 9.31 -28.25 1.49 -25.18 14.01 6.43 -3.72 -7.91 | |
8: 2.78 -21.18 -14.48 4.46 -2.76 -5.56 7.97 22.89 25.39 2.52 | |
9: -1.25 21.65 -0.40 -20.85 -8.72 21.09 13.49 21.71 0.28 5.36 | |
d_buf1: | |
0 1 2 3 4 5 6 7 8 9 | |
0: -17.85 15.45 29.83 -11.38 5.79 15.61 19.15 9.98 -11.49 -27.98 | |
1: 1.13 -0.55 -26.42 10.60 -11.73 -4.17 -7.25 -15.15 6.17 -12.38 | |
2: -16.42 3.21 -7.02 7.89 13.62 12.12 -3.95 -11.61 8.19 -12.24 | |
3: 22.70 -11.12 -3.97 -2.29 18.98 8.21 -10.20 12.04 0.70 -16.66 | |
4: 21.97 -26.24 -10.46 21.95 12.44 -12.76 -15.71 -15.82 15.88 -8.55 | |
5: -7.13 14.93 -24.84 -25.34 -7.92 18.41 -23.20 1.08 -17.93 6.16 | |
6: -2.36 34.30 -16.29 33.25 14.15 5.20 -14.69 -3.12 25.02 10.19 | |
7: -3.51 19.77 -2.87 -23.96 -4.40 6.66 10.74 3.28 15.41 22.75 | |
8: 0.27 24.76 -3.65 5.38 -10.45 -1.91 -0.84 4.80 -3.05 2.50 | |
9: 9.77 22.89 -0.84 -3.66 16.54 -5.05 10.40 17.90 17.88 13.88 | |
d_buf2: | |
0 1 2 3 4 5 6 7 8 9 | |
0: 3.82 8.53 -4.38 19.08 -13.52 -20.72 -8.40 -36.05 23.39 -27.46 | |
1: 31.15 2.84 24.74 10.01 40.84 -20.42 12.50 -4.05 -9.42 23.00 | |
2: -20.65 -11.29 25.55 -31.45 16.72 4.83 -6.82 3.89 1.01 -37.41 | |
3: -18.72 7.12 -16.33 31.78 -9.73 23.60 -13.09 -32.35 -2.36 0.99 | |
4: 23.95 17.44 5.02 -24.91 -25.48 -0.56 -14.92 -2.15 9.04 0.80 | |
5: 11.50 -20.21 -7.36 -2.30 -5.44 0.99 -27.50 -13.45 4.55 -1.81 | |
6: -30.59 -22.08 6.04 22.86 -19.77 22.61 28.22 24.19 -23.38 2.07 | |
7: -26.95 1.15 -4.19 -16.59 -1.65 19.18 -14.93 23.67 -2.68 -19.23 | |
8: -13.63 -9.89 -10.49 8.17 -7.38 17.51 3.24 -17.26 -14.58 30.00 | |
9: 38.16 6.26 -32.96 20.34 -17.18 30.02 13.20 -20.76 -26.26 1.31 | |
d_buf3: | |
0 1 2 3 4 5 6 7 8 9 | |
0: 0.28 -21.25 -17.63 -6.78 6.15 -14.58 -37.60 -14.78 5.14 -4.03 | |
1: 17.76 7.55 -17.32 -9.89 15.22 21.70 2.56 -25.64 19.43 -12.84 | |
2: 6.05 15.89 -9.39 -19.62 -2.24 12.65 9.67 -13.79 -6.18 7.44 | |
3: -21.40 13.45 9.97 4.24 -27.86 -12.46 -4.88 31.55 -18.01 -6.68 | |
4: -0.16 -11.86 30.73 -13.87 8.66 2.71 21.08 -5.22 -4.34 20.22 | |
5: 16.77 16.84 -17.47 3.49 7.09 -24.29 12.50 -42.44 -5.93 11.13 | |
6: -5.68 10.55 16.84 5.64 -6.08 6.76 20.73 -14.38 11.51 -12.96 | |
7: 5.96 27.90 6.53 7.02 16.63 0.97 -24.98 22.93 -4.20 6.19 | |
8: 47.52 -10.88 2.17 22.26 -14.21 -20.57 14.88 23.59 21.20 13.86 | |
9: -22.54 -6.03 33.59 -4.66 -23.98 -23.48 -18.33 28.37 11.93 37.77 | |
TEST F16_F32_ALIGNED_S m=32000 n=49 k=4096 batch=2 split_k=1 matmul 5018.51ms avg_err=0.0572886 | |
TEST F16_F32_ALIGNED_M m=32000 n=49 k=4096 batch=2 split_k=1 matmul 4478.95ms avg_err=0.0573242 | |
TEST F16_F32_ALIGNED_L m=32000 n=49 k=4096 batch=2 split_k=1 matmul 10702.9ms avg_err=0.0574044 | |
TEST F16_F32_ALIGNED_S m=32000 n=49 k=4096 batch=2 split_k=4 matmul 4694ms avg_err=0.0573665 | |
TEST F16_F32_ALIGNED_M m=32000 n=49 k=4096 batch=2 split_k=4 matmul 4240.19ms avg_err=0.0573169 | |
TEST F16_F32_ALIGNED_L m=32000 n=49 k=4096 batch=2 split_k=4 matmul 11029.5ms avg_err=0.0574001 | |
TEST F16_F32_ALIGNED_S m=512 n=512 k=128 batch=2 split_k=1 matmul 30.416ms avg_err=0.00271696 | |
TEST F16_F32_ALIGNED_M m=512 n=512 k=128 batch=2 split_k=1 matmul 23.762ms avg_err=0.00273248 | |
TEST F16_F32_ALIGNED_L m=512 n=512 k=128 batch=2 split_k=1 matmul 32.387ms avg_err=0.00272195 | |
TEST F16_F32_ALIGNED_S m=512 n=512 k=128 batch=2 split_k=4 matmul 29.809ms avg_err=0.00272533 | |
TEST F16_F32_ALIGNED_M m=512 n=512 k=128 batch=2 split_k=4 matmul 24.777ms avg_err=0.00273139 | |
TEST F16_F32_ALIGNED_L m=512 n=512 k=128 batch=2 split_k=4 matmul 41.608ms avg_err=0.00271922 | |
TEST F16_F32_ALIGNED_S m=128 n=512 k=512 batch=2 split_k=1 matmul 58.361ms avg_err=0.00815539 | |
TEST F16_F32_ALIGNED_M m=128 n=512 k=512 batch=2 split_k=1 matmul 18.179ms avg_err=0.00819759 | |
TEST F16_F32_ALIGNED_L m=128 n=512 k=512 batch=2 split_k=1 matmul 61.453ms avg_err=0.00816776 | |
TEST F16_F32_ALIGNED_S m=128 n=512 k=512 batch=2 split_k=4 matmul 26.532ms avg_err=0.00822431 | |
TEST F16_F32_ALIGNED_M m=128 n=512 k=512 batch=2 split_k=4 matmul 30.825ms avg_err=0.0082069 | |
TEST F16_F32_ALIGNED_L m=128 n=512 k=512 batch=2 split_k=4 matmul 40.268ms avg_err=0.00821236 | |
TEST F16_F32_ALIGNED_S m=4096 n=512 k=4096 batch=2 split_k=1 matmul 5277.95ms avg_err=0.0573358 | |
TEST F16_F32_ALIGNED_M m=4096 n=512 k=4096 batch=2 split_k=1 matmul 4569.04ms avg_err=0.0573388 | |
TEST F16_F32_ALIGNED_L m=4096 n=512 k=4096 batch=2 split_k=1 matmul 5748.63ms avg_err=0.0573721 | |
TEST F16_F32_ALIGNED_S m=4096 n=512 k=4096 batch=2 split_k=4 matmul 4726.71ms avg_err=0.0573418 | |
TEST F16_F32_ALIGNED_M m=4096 n=512 k=4096 batch=2 split_k=4 matmul 4503.21ms avg_err=0.0573443 | |
TEST F16_F32_ALIGNED_L m=4096 n=512 k=4096 batch=2 split_k=4 matmul 5912.87ms avg_err=0.0572939 | |
TEST F16_F32_ALIGNED_S m=11008 n=512 k=4096 batch=2 split_k=1 matmul 13501.3ms avg_err=0.0573719 | |
TEST F16_F32_ALIGNED_M m=11008 n=512 k=4096 batch=2 split_k=1 matmul 13548.5ms avg_err=0.0573067 | |
TEST F16_F32_ALIGNED_L m=11008 n=512 k=4096 batch=2 split_k=1 matmul 15782.7ms avg_err=0.0573335 | |
TEST F16_F32_ALIGNED_S m=11008 n=512 k=4096 batch=2 split_k=4 matmul 13712.4ms avg_err=0.0573502 | |
TEST F16_F32_ALIGNED_M m=11008 n=512 k=4096 batch=2 split_k=4 matmul 13128.6ms avg_err=0.0573654 | |
TEST F16_F32_ALIGNED_L m=11008 n=512 k=4096 batch=2 split_k=4 matmul 15848.6ms avg_err=0.0573447 | |
TEST F16_F32_ALIGNED_S m=4096 n=512 k=11008 batch=2 split_k=1 matmul 14262.1ms avg_err=0.152003 | |
m = 1 n = 0 b = 0 | |
Actual result: | |
0 1 2 3 4 5 6 7 8 9 | |
0: 41.30 -32.79 -11.27 11.53 18.27 30.16 77.38 39.32 67.36 16.98 | |
1: -1.27 48.66 -22.86 -17.40 -21.20 -38.58 -15.36 -65.66 -21.91 44.02 | |
2: 37.46 2.89 -2.35 20.38 -18.79 2.11 -11.16 -58.82 -1.29 38.14 | |
3: 38.80 37.97 -62.69 14.97 21.82 34.34 35.93 14.82 35.10 -12.02 | |
4: -23.31 45.17 -6.56 -17.08 -14.62 92.72 -45.27 36.13 22.57 54.48 | |
5: -7.03 23.27 14.67 3.31 63.67 -18.79 55.68 -29.59 -30.33 -1.68 | |
6: -49.97 34.74 4.48 -26.72 -33.70 5.50 -12.80 -31.37 -13.09 -5.13 | |
7: 40.28 -10.36 41.06 30.42 32.97 -50.52 -48.48 -51.08 25.32 32.34 | |
8: -6.20 37.63 -8.58 -31.15 59.62 -39.66 -53.12 30.64 47.74 10.00 | |
9: 13.26 -9.41 15.52 -30.46 -45.47 31.62 83.08 -27.60 -113.58 63.39 | |
Expected result: | |
0 1 2 3 4 5 6 7 8 9 | |
0: 41.32 -32.92 -11.36 11.62 18.35 30.04 77.41 39.25 67.54 17.06 | |
1: -1.16 48.54 -22.96 -17.41 -21.17 -38.72 -15.49 -65.64 -21.81 44.01 | |
2: 37.39 3.00 -2.38 20.36 -18.74 2.10 -11.22 -58.92 -1.30 38.13 | |
3: 38.75 37.96 -62.60 14.90 21.82 34.26 36.01 14.94 35.08 -12.14 | |
4: -23.31 45.21 -6.48 -17.05 -14.80 92.74 -45.26 36.24 22.55 54.46 | |
5: -7.01 23.34 14.65 3.20 63.75 -18.88 55.51 -29.60 -30.26 -1.52 | |
6: -50.05 34.60 4.58 -26.74 -33.88 5.68 -12.89 -31.47 -12.99 -5.24 | |
7: 40.22 -10.46 41.14 30.42 32.93 -50.52 -48.38 -51.17 25.44 32.30 | |
8: -6.06 37.65 -8.70 -31.00 59.55 -39.71 -53.34 30.61 47.54 10.03 | |
9: 13.14 -9.37 15.56 -30.58 -45.31 31.60 83.06 -27.69 -113.63 63.31 | |
TEST F16_F32_ALIGNED_M m=4096 n=512 k=11008 batch=2 split_k=1 matmul 14126.9ms avg_err=0.152033 | |
m = 0 n = 0 b = 0 | |
Actual result: | |
0 1 2 3 4 5 6 7 8 9 | |
0: -34.23 17.63 25.72 -12.34 -6.17 37.62 22.38 44.94 9.61 28.75 | |
1: 33.47 19.43 -40.65 -18.83 -42.43 51.59 -28.94 -32.01 -13.05 -54.67 | |
2: 48.77 -10.30 -28.15 -28.62 -36.81 66.43 4.76 4.33 3.08 -45.42 | |
3: 27.46 62.00 -2.90 -67.27 -19.34 4.46 -4.95 -27.09 -27.28 -17.32 | |
4: -18.72 95.68 -24.75 34.37 -6.60 -7.37 10.65 32.25 35.97 -23.84 | |
5: 17.04 -49.14 21.08 38.85 -25.35 -38.08 11.18 57.04 3.94 -33.91 | |
6: -12.63 28.01 61.87 -9.34 -20.86 -15.93 -12.25 44.43 14.71 17.13 | |
7: -14.32 -2.14 6.71 -3.49 9.49 -23.57 -59.02 41.83 -23.21 -4.38 | |
8: -24.32 1.44 -38.19 20.90 36.71 35.98 -23.29 22.06 47.49 -13.56 | |
9: 11.35 -78.87 -17.03 -87.58 -6.46 -38.65 -70.39 -27.03 -33.60 42.85 | |
Expected result: | |
0 1 2 3 4 5 6 7 8 9 | |
0: -34.16 17.61 25.76 -12.41 -6.14 37.65 22.45 44.95 9.60 28.71 | |
1: 33.42 19.35 -40.43 -18.87 -42.47 51.40 -28.75 -31.97 -13.12 -54.72 | |
2: 48.77 -10.29 -28.07 -28.65 -36.78 66.32 4.82 4.18 3.06 -45.48 | |
3: 27.55 62.03 -2.76 -67.30 -19.43 4.53 -4.97 -27.10 -27.25 -17.20 | |
4: -18.75 95.70 -24.73 34.33 -6.57 -7.48 10.57 31.96 35.88 -23.90 | |
5: 17.15 -49.04 21.18 38.93 -25.43 -38.01 11.16 57.08 4.00 -33.92 | |
6: -12.59 27.94 61.77 -9.34 -20.76 -15.82 -12.30 44.46 14.63 17.14 | |
7: -14.39 -2.08 6.69 -3.33 9.57 -23.53 -59.05 41.79 -23.25 -4.40 | |
8: -24.28 1.33 -38.18 20.88 36.66 35.92 -23.31 22.01 47.46 -13.69 | |
9: 11.29 -78.98 -16.98 -87.63 -6.50 -38.86 -70.45 -27.11 -33.58 42.83 | |
libc++abi: terminating due to uncaught exception of type vk::DeviceLostError: vk::Device::waitForFences: ErrorDeviceLost |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment