-
-
Save eric-unc/ded12b1f42f4b0a46361e9a1d34c983a to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| xbony2@xbony2s-Mac-Pro ollama % go run . serve | |
| 2025/02/11 23:40:05 routes.go:1186: INFO server config env="map[HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_KEEP_ALIVE:5m0s OLLAMA_KV_CACHE_TYPE: OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/Users/xbony2/.ollama/models OLLAMA_MULTIUSER_CACHE:false OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://* vscode-webview://*] OLLAMA_SCHED_SPREAD:false http_proxy: https_proxy: no_proxy:]" | |
| time=2025-02-11T23:40:05.639-05:00 level=INFO source=images.go:432 msg="total blobs: 7" | |
| time=2025-02-11T23:40:05.639-05:00 level=INFO source=images.go:439 msg="total unused blobs removed: 0" | |
| [GIN-debug] [WARNING] Creating an Engine instance with the Logger and Recovery middleware already attached. | |
| [GIN-debug] [WARNING] Running in "debug" mode. Switch to "release" mode in production. | |
| - using env: export GIN_MODE=release | |
| - using code: gin.SetMode(gin.ReleaseMode) | |
| [GIN-debug] POST /api/pull --> github.com/ollama/ollama/server.(*Server).PullHandler-fm (5 handlers) | |
| [GIN-debug] POST /api/generate --> github.com/ollama/ollama/server.(*Server).GenerateHandler-fm (5 handlers) | |
| [GIN-debug] POST /api/chat --> github.com/ollama/ollama/server.(*Server).ChatHandler-fm (5 handlers) | |
| [GIN-debug] POST /api/embed --> github.com/ollama/ollama/server.(*Server).EmbedHandler-fm (5 handlers) | |
| [GIN-debug] POST /api/embeddings --> github.com/ollama/ollama/server.(*Server).EmbeddingsHandler-fm (5 handlers) | |
| [GIN-debug] POST /api/create --> github.com/ollama/ollama/server.(*Server).CreateHandler-fm (5 handlers) | |
| [GIN-debug] POST /api/push --> github.com/ollama/ollama/server.(*Server).PushHandler-fm (5 handlers) | |
| [GIN-debug] POST /api/copy --> github.com/ollama/ollama/server.(*Server).CopyHandler-fm (5 handlers) | |
| [GIN-debug] DELETE /api/delete --> github.com/ollama/ollama/server.(*Server).DeleteHandler-fm (5 handlers) | |
| [GIN-debug] POST /api/show --> github.com/ollama/ollama/server.(*Server).ShowHandler-fm (5 handlers) | |
| [GIN-debug] POST /api/blobs/:digest --> github.com/ollama/ollama/server.(*Server).CreateBlobHandler-fm (5 handlers) | |
| [GIN-debug] HEAD /api/blobs/:digest --> github.com/ollama/ollama/server.(*Server).HeadBlobHandler-fm (5 handlers) | |
| [GIN-debug] GET /api/ps --> github.com/ollama/ollama/server.(*Server).PsHandler-fm (5 handlers) | |
| [GIN-debug] POST /v1/chat/completions --> github.com/ollama/ollama/server.(*Server).ChatHandler-fm (6 handlers) | |
| [GIN-debug] POST /v1/completions --> github.com/ollama/ollama/server.(*Server).GenerateHandler-fm (6 handlers) | |
| [GIN-debug] POST /v1/embeddings --> github.com/ollama/ollama/server.(*Server).EmbedHandler-fm (6 handlers) | |
| [GIN-debug] GET /v1/models --> github.com/ollama/ollama/server.(*Server).ListHandler-fm (6 handlers) | |
| [GIN-debug] GET /v1/models/:model --> github.com/ollama/ollama/server.(*Server).ShowHandler-fm (6 handlers) | |
| [GIN-debug] GET / --> github.com/ollama/ollama/server.(*Server).GenerateRoutes.func1 (5 handlers) | |
| [GIN-debug] GET /api/tags --> github.com/ollama/ollama/server.(*Server).ListHandler-fm (5 handlers) | |
| [GIN-debug] GET /api/version --> github.com/ollama/ollama/server.(*Server).GenerateRoutes.func2 (5 handlers) | |
| [GIN-debug] HEAD / --> github.com/ollama/ollama/server.(*Server).GenerateRoutes.func1 (5 handlers) | |
| [GIN-debug] HEAD /api/tags --> github.com/ollama/ollama/server.(*Server).ListHandler-fm (5 handlers) | |
| [GIN-debug] HEAD /api/version --> github.com/ollama/ollama/server.(*Server).GenerateRoutes.func2 (5 handlers) | |
| time=2025-02-11T23:40:05.640-05:00 level=INFO source=routes.go:1237 msg="Listening on 127.0.0.1:11434 (version 0.0.0)" | |
| time=2025-02-11T23:40:05.668-05:00 level=INFO source=types.go:130 msg="inference compute" id=0 library=metal variant="" compute="" driver=0.0 name="" total="3.0 GiB" available="3.0 GiB" | |
| [GIN] 2025/02/11 - 23:40:14 | 200 | 85.703µs | 127.0.0.1 | HEAD "/" | |
| [GIN] 2025/02/11 - 23:40:14 | 200 | 28.003558ms | 127.0.0.1 | POST "/api/show" | |
| time=2025-02-11T23:40:15.028-05:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/Users/xbony2/.ollama/models/blobs/sha256-aabd4debf0c8f08881923f2c25fc0fdeed24435271c2b3e92c4af36704040dbc gpu=0 parallel=4 available=3221225472 required="2.0 GiB" | |
| time=2025-02-11T23:40:15.028-05:00 level=WARN source=gpu_darwin.go:81 msg="failed to discover physical CPU details" query=hw.perflevel0.physicalcpu error="no such file or directory" | |
| time=2025-02-11T23:40:15.029-05:00 level=INFO source=server.go:100 msg="system memory" total="32.0 GiB" free="23.9 GiB" free_swap="0 B" | |
| time=2025-02-11T23:40:15.029-05:00 level=INFO source=memory.go:356 msg="offload to metal" layers.requested=-1 layers.model=29 layers.offload=29 layers.split="" memory.available="[3.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="2.0 GiB" memory.required.partial="2.0 GiB" memory.required.kv="224.0 MiB" memory.required.allocations="[2.0 GiB]" memory.weights.total="976.1 MiB" memory.weights.repeating="793.5 MiB" memory.weights.nonrepeating="182.6 MiB" memory.graph.full="299.8 MiB" memory.graph.partial="299.8 MiB" | |
| time=2025-02-11T23:40:15.031-05:00 level=INFO source=server.go:381 msg="starting llama server" cmd="/private/var/folders/jk/l7ft983x76dc9j6q7ls_dtr40000gn/T/go-build2680913291/b001/exe/ollama runner --model /Users/xbony2/.ollama/models/blobs/sha256-aabd4debf0c8f08881923f2c25fc0fdeed24435271c2b3e92c4af36704040dbc --ctx-size 8192 --batch-size 512 --n-gpu-layers 29 --parallel 4 --port 60419" | |
| time=2025-02-11T23:40:15.033-05:00 level=INFO source=sched.go:449 msg="loaded runners" count=1 | |
| time=2025-02-11T23:40:15.033-05:00 level=INFO source=server.go:558 msg="waiting for llama runner to start responding" | |
| time=2025-02-11T23:40:15.033-05:00 level=INFO source=server.go:592 msg="waiting for server to become available" status="llm server error" | |
| time=2025-02-11T23:40:15.078-05:00 level=INFO source=runner.go:936 msg="starting go runner" | |
| time=2025-02-11T23:40:15.078-05:00 level=INFO source=runner.go:937 msg=system info="CPU : SSE3 = 1 | SSSE3 = 1 | LLAMAFILE = 1 | CPU : SSE3 = 1 | SSSE3 = 1 | LLAMAFILE = 1 | cgo(clang)" threads=12 | |
| time=2025-02-11T23:40:15.078-05:00 level=INFO source=runner.go:995 msg="Server listening on 127.0.0.1:60419" | |
| llama_model_loader: loaded meta data with 26 key-value pairs and 339 tensors from /Users/xbony2/.ollama/models/blobs/sha256-aabd4debf0c8f08881923f2c25fc0fdeed24435271c2b3e92c4af36704040dbc (version GGUF V3 (latest)) | |
| llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output. | |
| llama_model_loader: - kv 0: general.architecture str = qwen2 | |
| llama_model_loader: - kv 1: general.type str = model | |
| llama_model_loader: - kv 2: general.name str = DeepSeek R1 Distill Qwen 1.5B | |
| llama_model_loader: - kv 3: general.basename str = DeepSeek-R1-Distill-Qwen | |
| llama_model_loader: - kv 4: general.size_label str = 1.5B | |
| llama_model_loader: - kv 5: qwen2.block_count u32 = 28 | |
| llama_model_loader: - kv 6: qwen2.context_length u32 = 131072 | |
| llama_model_loader: - kv 7: qwen2.embedding_length u32 = 1536 | |
| llama_model_loader: - kv 8: qwen2.feed_forward_length u32 = 8960 | |
| llama_model_loader: - kv 9: qwen2.attention.head_count u32 = 12 | |
| llama_model_loader: - kv 10: qwen2.attention.head_count_kv u32 = 2 | |
| llama_model_loader: - kv 11: qwen2.rope.freq_base f32 = 10000.000000 | |
| llama_model_loader: - kv 12: qwen2.attention.layer_norm_rms_epsilon f32 = 0.000001 | |
| llama_model_loader: - kv 13: general.file_type u32 = 15 | |
| llama_model_loader: - kv 14: tokenizer.ggml.model str = gpt2 | |
| llama_model_loader: - kv 15: tokenizer.ggml.pre str = qwen2 | |
| llama_model_loader: - kv 16: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ... | |
| llama_model_loader: - kv 17: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | |
| llama_model_loader: - kv 18: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",... | |
| llama_model_loader: - kv 19: tokenizer.ggml.bos_token_id u32 = 151646 | |
| llama_model_loader: - kv 20: tokenizer.ggml.eos_token_id u32 = 151643 | |
| llama_model_loader: - kv 21: tokenizer.ggml.padding_token_id u32 = 151643 | |
| llama_model_loader: - kv 22: tokenizer.ggml.add_bos_token bool = true | |
| llama_model_loader: - kv 23: tokenizer.ggml.add_eos_token bool = false | |
| llama_model_loader: - kv 24: tokenizer.chat_template str = {% if not add_generation_prompt is de... | |
| llama_model_loader: - kv 25: general.quantization_version u32 = 2 | |
| llama_model_loader: - type f32: 141 tensors | |
| llama_model_loader: - type q4_K: 169 tensors | |
| llama_model_loader: - type q6_K: 29 tensors | |
| time=2025-02-11T23:40:15.285-05:00 level=INFO source=server.go:592 msg="waiting for server to become available" status="llm server loading model" | |
| llm_load_vocab: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect | |
| llm_load_vocab: special tokens cache size = 22 | |
| llm_load_vocab: token to piece cache size = 0.9310 MB | |
| llm_load_print_meta: format = GGUF V3 (latest) | |
| llm_load_print_meta: arch = qwen2 | |
| llm_load_print_meta: vocab type = BPE | |
| llm_load_print_meta: n_vocab = 151936 | |
| llm_load_print_meta: n_merges = 151387 | |
| llm_load_print_meta: vocab_only = 0 | |
| llm_load_print_meta: n_ctx_train = 131072 | |
| llm_load_print_meta: n_embd = 1536 | |
| llm_load_print_meta: n_layer = 28 | |
| llm_load_print_meta: n_head = 12 | |
| llm_load_print_meta: n_head_kv = 2 | |
| llm_load_print_meta: n_rot = 128 | |
| llm_load_print_meta: n_swa = 0 | |
| llm_load_print_meta: n_embd_head_k = 128 | |
| llm_load_print_meta: n_embd_head_v = 128 | |
| llm_load_print_meta: n_gqa = 6 | |
| llm_load_print_meta: n_embd_k_gqa = 256 | |
| llm_load_print_meta: n_embd_v_gqa = 256 | |
| llm_load_print_meta: f_norm_eps = 0.0e+00 | |
| llm_load_print_meta: f_norm_rms_eps = 1.0e-06 | |
| llm_load_print_meta: f_clamp_kqv = 0.0e+00 | |
| llm_load_print_meta: f_max_alibi_bias = 0.0e+00 | |
| llm_load_print_meta: f_logit_scale = 0.0e+00 | |
| llm_load_print_meta: n_ff = 8960 | |
| llm_load_print_meta: n_expert = 0 | |
| llm_load_print_meta: n_expert_used = 0 | |
| llm_load_print_meta: causal attn = 1 | |
| llm_load_print_meta: pooling type = 0 | |
| llm_load_print_meta: rope type = 2 | |
| llm_load_print_meta: rope scaling = linear | |
| llm_load_print_meta: freq_base_train = 10000.0 | |
| llm_load_print_meta: freq_scale_train = 1 | |
| llm_load_print_meta: n_ctx_orig_yarn = 131072 | |
| llm_load_print_meta: rope_finetuned = unknown | |
| llm_load_print_meta: ssm_d_conv = 0 | |
| llm_load_print_meta: ssm_d_inner = 0 | |
| llm_load_print_meta: ssm_d_state = 0 | |
| llm_load_print_meta: ssm_dt_rank = 0 | |
| llm_load_print_meta: ssm_dt_b_c_rms = 0 | |
| llm_load_print_meta: model type = 1.5B | |
| llm_load_print_meta: model ftype = Q4_K - Medium | |
| llm_load_print_meta: model params = 1.78 B | |
| llm_load_print_meta: model size = 1.04 GiB (5.00 BPW) | |
| llm_load_print_meta: general.name = DeepSeek R1 Distill Qwen 1.5B | |
| llm_load_print_meta: BOS token = 151646 '<|begin▁of▁sentence|>' | |
| llm_load_print_meta: EOS token = 151643 '<|end▁of▁sentence|>' | |
| llm_load_print_meta: EOT token = 151643 '<|end▁of▁sentence|>' | |
| llm_load_print_meta: PAD token = 151643 '<|end▁of▁sentence|>' | |
| llm_load_print_meta: LF token = 148848 'ÄĬ' | |
| llm_load_print_meta: FIM PRE token = 151659 '<|fim_prefix|>' | |
| llm_load_print_meta: FIM SUF token = 151661 '<|fim_suffix|>' | |
| llm_load_print_meta: FIM MID token = 151660 '<|fim_middle|>' | |
| llm_load_print_meta: FIM PAD token = 151662 '<|fim_pad|>' | |
| llm_load_print_meta: FIM REP token = 151663 '<|repo_name|>' | |
| llm_load_print_meta: FIM SEP token = 151664 '<|file_sep|>' | |
| llm_load_print_meta: EOG token = 151643 '<|end▁of▁sentence|>' | |
| llm_load_print_meta: EOG token = 151662 '<|fim_pad|>' | |
| llm_load_print_meta: EOG token = 151663 '<|repo_name|>' | |
| llm_load_print_meta: EOG token = 151664 '<|file_sep|>' | |
| llm_load_print_meta: max token length = 256 | |
| llm_load_tensors: CPU_Mapped model buffer size = 1059.89 MiB | |
| llama_new_context_with_model: n_seq_max = 4 | |
| llama_new_context_with_model: n_ctx = 8192 | |
| llama_new_context_with_model: n_ctx_per_seq = 2048 | |
| llama_new_context_with_model: n_batch = 2048 | |
| llama_new_context_with_model: n_ubatch = 512 | |
| llama_new_context_with_model: flash_attn = 0 | |
| llama_new_context_with_model: freq_base = 10000.0 | |
| llama_new_context_with_model: freq_scale = 1 | |
| llama_new_context_with_model: n_ctx_per_seq (2048) < n_ctx_train (131072) -- the full capacity of the model will not be utilized | |
| llama_kv_cache_init: kv_size = 8192, offload = 1, type_k = 'f16', type_v = 'f16', n_layer = 28, can_shift = 1 | |
| llama_kv_cache_init: CPU KV buffer size = 224.00 MiB | |
| llama_new_context_with_model: KV self size = 224.00 MiB, K (f16): 112.00 MiB, V (f16): 112.00 MiB | |
| llama_new_context_with_model: CPU output buffer size = 2.34 MiB | |
| llama_new_context_with_model: CPU compute buffer size = 302.75 MiB | |
| llama_new_context_with_model: graph nodes = 986 | |
| llama_new_context_with_model: graph splits = 1 | |
| time=2025-02-11T23:40:15.795-05:00 level=INFO source=server.go:597 msg="llama runner started in 0.76 seconds" |
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.