Harbor bench - engines recipe
#!/bin/bash
# Note: this file is not meant to be run as-is in one go;
# run the blocks for the engines you care about individually.
OPENROUTER_KEY="<your_openrouter_key>"
TASKS="<path_to_tasks_file>"
NAME=engbench
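# Hypothetical guard (not part of the original recipe): fail fast if the
# placeholders above were not replaced before running any block.
if [[ "$OPENROUTER_KEY" == \<* || "$TASKS" == \<* ]]; then
  echo "Set OPENROUTER_KEY and TASKS above first" >&2
  exit 1
fi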
# # Install Harbor
# curl https://av.codes/get-harbor.sh | bash
# # Pre-pull/build relevant images
# h pull ollama
# h pull llamacpp
# h pull aphrodite
# h pull mistralrs
# h pull tabbyapi
# h pull vllm
# h build bench
# Common
h bench judge meta-llama/llama-3.1-70b-instruct
h bench judge_api https://openrouter.ai/api
h bench judge_key $OPENROUTER_KEY
h bench tasks $TASKS
h config set bench.parallel 4
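# Optional sanity check, reading a setting back the way tabbyapi.api.key is
# read later in this file (reading bench.parallel back is an assumption):
h config get bench.parallel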
# ===== Reference tests
# Llama 3.1 8B
h bench model meta-llama/llama-3.1-8b-instruct
h bench variants --temperature 0 --temperature 0.5 --temperature 1.0 --max_tokens 1024
h bench api https://openrouter.ai/api
h bench key $OPENROUTER_KEY
h bench run --name openrouter-llama3.1-8b-${NAME}
# Llama 3.1 70B
h bench model meta-llama/llama-3.1-70b-instruct
h bench variants --temperature 0 --temperature 0.5 --temperature 1.0 --max_tokens 1024
h bench api https://openrouter.ai/api
h bench key $OPENROUTER_KEY
h bench run --name openrouter-llama3.1-70b-${NAME}
# Sonnet 3.5
h bench model anthropic/claude-3.5-sonnet
h bench variants --temperature 0 --max_tokens 1024
h bench api https://openrouter.ai/api
h bench key $OPENROUTER_KEY
h bench run --name openrouter-claude-3.5-sonnet-${NAME}
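# The three OpenRouter runs above share one shape; a hypothetical helper,
# built only from the commands above, for any further cloud baselines:
bench_openrouter() {
  local model="$1" run_name="$2"
  shift 2
  h bench model "$model"
  h bench variants "$@"
  h bench api https://openrouter.ai/api
  h bench key "$OPENROUTER_KEY"
  h bench run --name "${run_name}-${NAME}"
}
# e.g. bench_openrouter meta-llama/llama-3.1-8b-instruct openrouter-llama3.1-8b --temperature 0 --max_tokens 1024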
# ===== Local tests
# Ollama q2_K
h bench model llama3.1:8b-instruct-q2_K
h bench api http://harbor.ollama:11434
h bench variants --temperature 0 --temperature 0.5 --temperature 1.0 --max_tokens 1024
h bench run --name ollama-llama3.1-8b-q2_K-${NAME}
# Ollama q4_0
h bench model llama3.1:8b-instruct-q4_0
h bench api http://harbor.ollama:11434
h bench variants --temperature 0 --temperature 0.5 --temperature 1.0 --max_tokens 1024
h bench run --name ollama-llama3.1-8b-q4_0-${NAME}
# Ollama q6_K
h bench model llama3.1:8b-instruct-q6_K
h bench api http://harbor.ollama:11434
h bench variants --temperature 0 --temperature 0.5 --temperature 1.0 --max_tokens 1024
h bench run --name ollama-llama3.1-8b-q6_K-${NAME}
# Ollama q8_0
h bench model llama3.1:8b-instruct-q8_0
h bench api http://harbor.ollama:11434
h bench variants --temperature 0 --temperature 0.5 --temperature 1.0 --max_tokens 1024
h bench run --name ollama-llama3.1-8b-q8_0-${NAME}
# Ollama fp16
h bench model llama3.1:8b-instruct-fp16
h bench api http://harbor.ollama:11434
h bench variants --temperature 0 --max_tokens 1024
h bench run --name ollama-llama3.1-8b-fp16-${NAME}
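# The four quant sweeps above restated as a loop (same commands, no new
# behaviour; fp16 stays separate since it only ran at temperature 0):
# for quant in q2_K q4_0 q6_K q8_0; do
#   h bench model "llama3.1:8b-instruct-${quant}"
#   h bench api http://harbor.ollama:11434
#   h bench variants --temperature 0 --temperature 0.5 --temperature 1.0 --max_tokens 1024
#   h bench run --name "ollama-llama3.1-8b-${quant}-${NAME}"
# done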
# Llama.cpp Q8_0
h llamacpp model https://huggingface.co/lmstudio-community/Meta-Llama-3.1-8B-Instruct-GGUF/blob/main/Meta-Llama-3.1-8B-Instruct-Q8_0.gguf
h llamacpp args -ngl 99 --ctx-size 8192 -np 4 --n-predict 1024
h bench model /root/.cache/llama.cpp/Meta-Llama-3.1-8B-Instruct-Q8_0.gguf
h bench api http://harbor.llamacpp:8080
h bench variants --temperature 0 --temperature 0.5 --temperature 1.0 --max_tokens 1024
h bench run --name llamacpp-llama3.1-8b-Q8_0-${NAME}
# Llama.cpp Q4_K_M
h llamacpp model https://huggingface.co/lmstudio-community/Meta-Llama-3.1-8B-Instruct-GGUF/blob/main/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf
h llamacpp args -ngl 99 --ctx-size 8192 -np 4 --n-predict 1024
h bench model /root/.cache/llama.cpp/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf
h bench api http://harbor.llamacpp:8080
h bench variants --temperature 0 --temperature 0.5 --temperature 1.0 --max_tokens 1024
h bench run --name llamacpp-llama3.1-8b-Q4_K_M-${NAME}
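# The bench model path is llama.cpp's cache dir plus the GGUF filename from
# the model URL; a sketch that derives it with basename instead of writing
# the filename twice (the GGUF_URL variable is an addition here):
# GGUF_URL="https://huggingface.co/lmstudio-community/Meta-Llama-3.1-8B-Instruct-GGUF/blob/main/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf"
# h llamacpp model "$GGUF_URL"
# h bench model "/root/.cache/llama.cpp/$(basename "$GGUF_URL")"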
# Aphrodite
h aphrodite model meta-llama/Meta-Llama-3.1-8B-Instruct
h aphrodite args --quantization fp8 --max-model-len 2048
h bench model $(h aphrodite model)
h bench api $(h url -i aphrodite)
h bench variants --temperature 0 --temperature 0.5 --temperature 1.0 --max_tokens 1024
h bench run --name aphrodite-llama3.1-8B-fp8-${NAME}
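# This and the following engine blocks reuse the engine's own config:
# $(h <engine> model) echoes the model set just above, and h url -i is
# assumed to resolve the service's in-network URL, so the bench settings
# track the engine automatically.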
# Mistral.rs Q8_0
h mistralrs model meta-llama/Meta-Llama-3.1-8B-Instruct
h mistralrs arch llama
h mistralrs type plain
h mistralrs isq Q8_0
h mistralrs args --no-paged-attn
h bench model $(h mistralrs model)
h bench api $(h url -i mistralrs)
h bench variants --temperature 0 --temperature 0.5 --temperature 1.0 --max_tokens 1024
h bench run --name mistralrs-llama3.1-8B-ISQ8_0-${NAME}
# Mistral.rs Q6K
h mistralrs model meta-llama/Meta-Llama-3.1-8B-Instruct
h mistralrs arch llama
h mistralrs type plain
h mistralrs isq Q6K
h mistralrs args --no-paged-attn --truncate-sequence
h bench model $(h mistralrs model)
h bench api $(h url -i mistralrs)
h bench variants --temperature 0 --temperature 0.5 --temperature 1.0 --max_tokens 1024
h bench run --name mistralrs-llama3.1-8B-ISQ6K-${NAME}
# Mistral.rs Q4K
h mistralrs model meta-llama/Meta-Llama-3.1-8B-Instruct
h mistralrs arch llama
h mistralrs type plain
h mistralrs isq Q4K
h mistralrs args --no-paged-attn --truncate-sequence
h bench model $(h mistralrs model)
h bench api $(h url -i mistralrs)
h bench variants --temperature 0 --temperature 0.5 --temperature 1.0 --max_tokens 1024
h bench run --name mistralrs-llama3.1-8B-ISQ4K-${NAME}
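# The three ISQ runs above differ only in the level and in
# --truncate-sequence (absent for Q8_0); a commented-out restatement:
# for isq in Q8_0 Q6K Q4K; do
#   extra=""
#   [[ "$isq" == Q8_0 ]] || extra="--truncate-sequence"
#   h mistralrs model meta-llama/Meta-Llama-3.1-8B-Instruct
#   h mistralrs arch llama
#   h mistralrs type plain
#   h mistralrs isq "$isq"
#   h mistralrs args --no-paged-attn $extra
#   h bench model $(h mistralrs model)
#   h bench api $(h url -i mistralrs)
#   h bench variants --temperature 0 --temperature 0.5 --temperature 1.0 --max_tokens 1024
#   h bench run --name "mistralrs-llama3.1-8B-IS${isq}-${NAME}"
# done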
# TabbyAPI 8bpw
h config set tabbyapi.model.specifier hub/models--turboderp--Llama-3.1-8B-Instruct-exl2/snapshots/77ba6c6d3e518d93aa4077433f52c8d73eddca44
h tabbyapi args --max-seq-len 2048
h bench model turboderp/Llama-3.1-8B-Instruct-exl2
h bench api $(h url -i tabbyapi)
h bench key $(h config get tabbyapi.api.key)
h bench variants --temperature 0 --temperature 0.5 --temperature 1.0 --max_tokens 1024
h bench run --name tabbyapi-llama3.1-8B-8bpw-${NAME}
# TabbyAPI 6bpw
h config set tabbyapi.model.specifier hub/models--turboderp--Llama-3.1-8B-Instruct-exl2/snapshots/ddb8769ce76c63fbd7e6baf4d0b2ffd8a1a221f9
h tabbyapi args --max-seq-len 2048
h bench model turboderp/Llama-3.1-8B-Instruct-exl2
h bench api $(h url -i tabbyapi)
h bench key $(h config get tabbyapi.api.key)
h bench variants --temperature 0 --temperature 0.5 --temperature 1.0 --max_tokens 1024
h bench run --name tabbyapi-llama3.1-8B-6bpw-${NAME}
# TabbyAPI 4bpw
h config set tabbyapi.model.specifier hub/models--turboderp--Llama-3.1-8B-Instruct-exl2/snapshots/cb83dde90c495cd21bd5cedd0eabf342aa602cbb
h tabbyapi args --max-seq-len 2048
h bench model turboderp/Llama-3.1-8B-Instruct-exl2
h bench api $(h url -i tabbyapi)
h bench key $(h config get tabbyapi.api.key)
h bench variants --temperature 0 --temperature 0.5 --temperature 1.0 --max_tokens 1024
h bench run --name tabbyapi-llama3.1-8B-4bpw-${NAME}
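# The three exl2 runs above differ only in the snapshot hash; a commented-out
# sweep that keeps the bpw -> snapshot pairs together (hashes copied from above):
# declare -A EXL2=(
#   [8bpw]=77ba6c6d3e518d93aa4077433f52c8d73eddca44
#   [6bpw]=ddb8769ce76c63fbd7e6baf4d0b2ffd8a1a221f9
#   [4bpw]=cb83dde90c495cd21bd5cedd0eabf342aa602cbb
# )
# for bpw in 8bpw 6bpw 4bpw; do
#   h config set tabbyapi.model.specifier "hub/models--turboderp--Llama-3.1-8B-Instruct-exl2/snapshots/${EXL2[$bpw]}"
#   h tabbyapi args --max-seq-len 2048
#   h bench model turboderp/Llama-3.1-8B-Instruct-exl2
#   h bench api $(h url -i tabbyapi)
#   h bench key $(h config get tabbyapi.api.key)
#   h bench variants --temperature 0 --temperature 0.5 --temperature 1.0 --max_tokens 1024
#   h bench run --name "tabbyapi-llama3.1-8B-${bpw}-${NAME}"
# done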
# vLLM
h vllm model meta-llama/Meta-Llama-3.1-8B-Instruct
h vllm args --quantization fp8 --max-model-len 2048
h bench model meta-llama/Meta-Llama-3.1-8B-Instruct
h bench api $(h url -i vllm)
h bench variants --temperature 0 --temperature 0.5 --temperature 1.0 --max_tokens 1024
h bench run --name vllm-llama3.1-8B-fp8-${NAME}
# vLLM bnb
h vllm model meta-llama/Meta-Llama-3.1-8B-Instruct
h vllm args --load-format bitsandbytes --quantization bitsandbytes --max-model-len 2048
h bench model meta-llama/Meta-Llama-3.1-8B-Instruct
h bench api $(h url -i vllm)
h bench variants --temperature 0 --temperature 0.5 --temperature 1.0 --max_tokens 1024
h bench run --name vllm-llama3.1-8B-bnb-${NAME}