Harbor bench - engines recipe (gist created September 12, 2024, 16:22).
Note: this file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below; to review it, open the file in an editor that reveals hidden Unicode characters.
#!/bin/bash
# Harbor bench - engines recipe.
# Note that you're not expected to run this
# file as is in one go — it is a sequence of
# copy/paste-able benchmark invocations.

# Fill these in before use. The placeholders are quoted so the
# script stays syntactically valid (unquoted `<...>` would be
# parsed as input redirections by the shell).
OPENROUTER_KEY="<your_openrouter_key>"
TASKS="<path_to_tasks_file>"
NAME=engbench
# # Install Harbor
# curl https://av.codes/get-harbor.sh | bash

# # Pre-pull/build relevant images
# h pull ollama
# h pull llamacpp
# h pull aphrodite
# h pull mistralrs
# h pull tabbyapi
# h pull vllm
# h build bench
# Common judge configuration, shared by all runs below.
# The judge model is served via OpenRouter.
h bench judge meta-llama/llama-3.1-70b-instruct
h bench judge_api https://openrouter.ai/api
h bench judge_key "$OPENROUTER_KEY"
h bench tasks "$TASKS"
h config set bench.parallel 4
# ===== Reference tests (hosted models via OpenRouter)

# Llama 3.1 8B
h bench model meta-llama/llama-3.1-8b-instruct
h bench variants --temperature 0 --temperature 0.5 --temperature 1.0 --max_tokens 1024
h bench api https://openrouter.ai/api
h bench key "$OPENROUTER_KEY"
h bench run --name "openrouter-llama3.1-8b-${NAME}"

# Llama 3.1 70B
h bench model meta-llama/llama-3.1-70b-instruct
h bench variants --temperature 0 --temperature 0.5 --temperature 1.0 --max_tokens 1024
h bench api https://openrouter.ai/api
h bench key "$OPENROUTER_KEY"
h bench run --name "openrouter-llama3.1-70b-${NAME}"

# Sonnet 3.5 (single temperature — used as a fixed reference point)
h bench model anthropic/claude-3.5-sonnet
h bench variants --temperature 0 --max_tokens 1024
h bench api https://openrouter.ai/api
h bench key "$OPENROUTER_KEY"
h bench run --name "openrouter-claude-3.5-sonnet-${NAME}"
# ===== Local tests

# Ollama — same model at different quantization levels,
# all served from the in-network Ollama container.

# Ollama q2_K
h bench model llama3.1:8b-instruct-q2_K
h bench api http://harbor.ollama:11434
h bench variants --temperature 0 --temperature 0.5 --temperature 1.0 --max_tokens 1024
h bench run --name "ollama-llama3.1-8b-q2_K-${NAME}"

# Ollama q4_0
h bench model llama3.1:8b-instruct-q4_0
h bench api http://harbor.ollama:11434
h bench variants --temperature 0 --temperature 0.5 --temperature 1.0 --max_tokens 1024
h bench run --name "ollama-llama3.1-8b-q4_0-${NAME}"

# Ollama q6_K
h bench model llama3.1:8b-instruct-q6_K
h bench api http://harbor.ollama:11434
h bench variants --temperature 0 --temperature 0.5 --temperature 1.0 --max_tokens 1024
h bench run --name "ollama-llama3.1-8b-q6_K-${NAME}"

# Ollama q8_0
h bench model llama3.1:8b-instruct-q8_0
h bench api http://harbor.ollama:11434
h bench variants --temperature 0 --temperature 0.5 --temperature 1.0 --max_tokens 1024
h bench run --name "ollama-llama3.1-8b-q8_0-${NAME}"

# Ollama fp16 (single temperature only)
h bench model llama3.1:8b-instruct-fp16
h bench api http://harbor.ollama:11434
h bench variants --temperature 0 --max_tokens 1024
h bench run --name "ollama-llama3.1-8b-fp16-${NAME}"
# Llama.cpp — GGUF quants pulled from Hugging Face; the bench
# points at the file path inside the llama.cpp container cache.

# Llama.cpp Q8_0
h llamacpp model https://huggingface.co/lmstudio-community/Meta-Llama-3.1-8B-Instruct-GGUF/blob/main/Meta-Llama-3.1-8B-Instruct-Q8_0.gguf
h llamacpp args -ngl 99 --ctx-size 8192 -np 4 --n-predict 1024
h bench model /root/.cache/llama.cpp/Meta-Llama-3.1-8B-Instruct-Q8_0.gguf
h bench api http://harbor.llamacpp:8080
h bench variants --temperature 0 --temperature 0.5 --temperature 1.0 --max_tokens 1024
h bench run --name "llamacpp-llama3.1-8b-Q8_0-${NAME}"

# Llama.cpp Q4_K_M
h llamacpp model https://huggingface.co/lmstudio-community/Meta-Llama-3.1-8B-Instruct-GGUF/blob/main/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf
h llamacpp args -ngl 99 --ctx-size 8192 -np 4 --n-predict 1024
h bench model /root/.cache/llama.cpp/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf
h bench api http://harbor.llamacpp:8080
h bench variants --temperature 0 --temperature 0.5 --temperature 1.0 --max_tokens 1024
h bench run --name "llamacpp-llama3.1-8b-Q4_K_M-${NAME}"
# Aphrodite — fp8 quantization; model/URL are read back from
# Harbor config so bench always matches the engine settings.
h aphrodite model meta-llama/Meta-Llama-3.1-8B-Instruct
h aphrodite args --quantization fp8 --max-model-len 2048
h bench model "$(h aphrodite model)"
h bench api "$(h url -i aphrodite)"
h bench variants --temperature 0 --temperature 0.5 --temperature 1.0 --max_tokens 1024
h bench run --name "aphrodite-llama3.1-8B-fp8-${NAME}"
# Mistral.rs — in-situ quantization (ISQ) at several levels.

# Mistral.rs Q8_0
h mistralrs model meta-llama/Meta-Llama-3.1-8B-Instruct
h mistralrs arch llama
h mistralrs type plain
h mistralrs isq Q8_0
h mistralrs args --no-paged-attn
h bench model "$(h mistralrs model)"
h bench api "$(h url -i mistralrs)"
h bench variants --temperature 0 --temperature 0.5 --temperature 1.0 --max_tokens 1024
h bench run --name "mistralrs-llama3.1-8B-ISQ8_0-${NAME}"

# Mistral.rs Q6K
h mistralrs model meta-llama/Meta-Llama-3.1-8B-Instruct
h mistralrs arch llama
h mistralrs type plain
h mistralrs isq Q6K
h mistralrs args --no-paged-attn --truncate-sequence
h bench model "$(h mistralrs model)"
h bench api "$(h url -i mistralrs)"
h bench variants --temperature 0 --temperature 0.5 --temperature 1.0 --max_tokens 1024
h bench run --name "mistralrs-llama3.1-8B-ISQ6K-${NAME}"

# Mistral.rs Q4K
h mistralrs model meta-llama/Meta-Llama-3.1-8B-Instruct
h mistralrs arch llama
h mistralrs type plain
h mistralrs isq Q4K
h mistralrs args --no-paged-attn --truncate-sequence
h bench model "$(h mistralrs model)"
h bench api "$(h url -i mistralrs)"
h bench variants --temperature 0 --temperature 0.5 --temperature 1.0 --max_tokens 1024
h bench run --name "mistralrs-llama3.1-8B-ISQ4K-${NAME}"
# TabbyAPI — exl2 quants selected by pinning a specific HF
# snapshot hash via the model specifier.

# TabbyAPI 8bpw
h config set tabbyapi.model.specifier hub/models--turboderp--Llama-3.1-8B-Instruct-exl2/snapshots/77ba6c6d3e518d93aa4077433f52c8d73eddca44
h tabbyapi args --max-seq-len 2048
h bench model turboderp/Llama-3.1-8B-Instruct-exl2
h bench api "$(h url -i tabbyapi)"
h bench key "$(h config get tabbyapi.api.key)"
h bench variants --temperature 0 --temperature 0.5 --temperature 1.0 --max_tokens 1024
h bench run --name "tabbyapi-llama3.1-8B-8bpw-${NAME}"

# TabbyAPI 6bpw
h config set tabbyapi.model.specifier hub/models--turboderp--Llama-3.1-8B-Instruct-exl2/snapshots/ddb8769ce76c63fbd7e6baf4d0b2ffd8a1a221f9
h tabbyapi args --max-seq-len 2048
h bench model turboderp/Llama-3.1-8B-Instruct-exl2
h bench api "$(h url -i tabbyapi)"
h bench key "$(h config get tabbyapi.api.key)"
h bench variants --temperature 0 --temperature 0.5 --temperature 1.0 --max_tokens 1024
h bench run --name "tabbyapi-llama3.1-8B-6bpw-${NAME}"

# TabbyAPI 4bpw
h config set tabbyapi.model.specifier hub/models--turboderp--Llama-3.1-8B-Instruct-exl2/snapshots/cb83dde90c495cd21bd5cedd0eabf342aa602cbb
h tabbyapi args --max-seq-len 2048
h bench model turboderp/Llama-3.1-8B-Instruct-exl2
h bench api "$(h url -i tabbyapi)"
h bench key "$(h config get tabbyapi.api.key)"
h bench variants --temperature 0 --temperature 0.5 --temperature 1.0 --max_tokens 1024
h bench run --name "tabbyapi-llama3.1-8B-4bpw-${NAME}"
# vLLM — fp8 and bitsandbytes quantization of the same model.

# vLLM fp8
h vllm model meta-llama/Meta-Llama-3.1-8B-Instruct
h vllm args --quantization fp8 --max-model-len 2048
h bench model meta-llama/Meta-Llama-3.1-8B-Instruct
h bench api "$(h url -i vllm)"
h bench variants --temperature 0 --temperature 0.5 --temperature 1.0 --max_tokens 1024
h bench run --name "vllm-llama3.1-8B-fp8-${NAME}"

# vLLM bnb
h vllm model meta-llama/Meta-Llama-3.1-8B-Instruct
h vllm args --load-format bitsandbytes --quantization bitsandbytes --max-model-len 2048
h bench model meta-llama/Meta-Llama-3.1-8B-Instruct
h bench api "$(h url -i vllm)"
h bench variants --temperature 0 --temperature 0.5 --temperature 1.0 --max_tokens 1024
h bench run --name "vllm-llama3.1-8B-bnb-${NAME}"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.