Harbor bench - engines recipe (gist created September 12, 2024, 16:22).
Note: this file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below; to review it, open the file in an editor that reveals hidden Unicode characters.
#!/bin/bash
# Harbor bench - engines recipe.
# Note that you're not expected to run this
# file as is in one go — it is a sequence of
# copy/paste-able benchmark invocations.

# Fill these in before use. The placeholders are quoted so the
# script stays syntactically valid (unquoted `<...>` would be
# parsed as input redirections by the shell).
OPENROUTER_KEY="<your_openrouter_key>"
TASKS="<path_to_tasks_file>"
NAME=engbench
# # Install Harbor
# curl https://av.codes/get-harbor.sh | bash

# # Pre-pull/build relevant images
# h pull ollama
# h pull llamacpp
# h pull aphrodite
# h pull mistralrs
# h pull tabbyapi
# h pull vllm
# h build bench
# Common judge configuration, shared by all runs below.
# The judge model is served via OpenRouter.
h bench judge meta-llama/llama-3.1-70b-instruct
h bench judge_api https://openrouter.ai/api
h bench judge_key "$OPENROUTER_KEY"
h bench tasks "$TASKS"
h config set bench.parallel 4
# ===== Reference tests (hosted models via OpenRouter)

# Llama 3.1 8B
h bench model meta-llama/llama-3.1-8b-instruct
h bench variants --temperature 0 --temperature 0.5 --temperature 1.0 --max_tokens 1024
h bench api https://openrouter.ai/api
h bench key "$OPENROUTER_KEY"
h bench run --name "openrouter-llama3.1-8b-${NAME}"

# Llama 3.1 70B
h bench model meta-llama/llama-3.1-70b-instruct
h bench variants --temperature 0 --temperature 0.5 --temperature 1.0 --max_tokens 1024
h bench api https://openrouter.ai/api
h bench key "$OPENROUTER_KEY"
h bench run --name "openrouter-llama3.1-70b-${NAME}"

# Sonnet 3.5 (single temperature — used as a fixed reference point)
h bench model anthropic/claude-3.5-sonnet
h bench variants --temperature 0 --max_tokens 1024
h bench api https://openrouter.ai/api
h bench key "$OPENROUTER_KEY"
h bench run --name "openrouter-claude-3.5-sonnet-${NAME}"
# ===== Local tests

# Ollama — same model at different quantization levels,
# all served from the in-network Ollama container.

# Ollama q2_K
h bench model llama3.1:8b-instruct-q2_K
h bench api http://harbor.ollama:11434
h bench variants --temperature 0 --temperature 0.5 --temperature 1.0 --max_tokens 1024
h bench run --name "ollama-llama3.1-8b-q2_K-${NAME}"

# Ollama q4_0
h bench model llama3.1:8b-instruct-q4_0
h bench api http://harbor.ollama:11434
h bench variants --temperature 0 --temperature 0.5 --temperature 1.0 --max_tokens 1024
h bench run --name "ollama-llama3.1-8b-q4_0-${NAME}"

# Ollama q6_K
h bench model llama3.1:8b-instruct-q6_K
h bench api http://harbor.ollama:11434
h bench variants --temperature 0 --temperature 0.5 --temperature 1.0 --max_tokens 1024
h bench run --name "ollama-llama3.1-8b-q6_K-${NAME}"

# Ollama q8_0
h bench model llama3.1:8b-instruct-q8_0
h bench api http://harbor.ollama:11434
h bench variants --temperature 0 --temperature 0.5 --temperature 1.0 --max_tokens 1024
h bench run --name "ollama-llama3.1-8b-q8_0-${NAME}"

# Ollama fp16 (single temperature only)
h bench model llama3.1:8b-instruct-fp16
h bench api http://harbor.ollama:11434
h bench variants --temperature 0 --max_tokens 1024
h bench run --name "ollama-llama3.1-8b-fp16-${NAME}"
# Llama.cpp — GGUF quants pulled from Hugging Face; the bench
# points at the file path inside the llama.cpp container cache.

# Llama.cpp Q8_0
h llamacpp model https://huggingface.co/lmstudio-community/Meta-Llama-3.1-8B-Instruct-GGUF/blob/main/Meta-Llama-3.1-8B-Instruct-Q8_0.gguf
h llamacpp args -ngl 99 --ctx-size 8192 -np 4 --n-predict 1024
h bench model /root/.cache/llama.cpp/Meta-Llama-3.1-8B-Instruct-Q8_0.gguf
h bench api http://harbor.llamacpp:8080
h bench variants --temperature 0 --temperature 0.5 --temperature 1.0 --max_tokens 1024
h bench run --name "llamacpp-llama3.1-8b-Q8_0-${NAME}"

# Llama.cpp Q4_K_M
h llamacpp model https://huggingface.co/lmstudio-community/Meta-Llama-3.1-8B-Instruct-GGUF/blob/main/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf
h llamacpp args -ngl 99 --ctx-size 8192 -np 4 --n-predict 1024
h bench model /root/.cache/llama.cpp/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf
h bench api http://harbor.llamacpp:8080
h bench variants --temperature 0 --temperature 0.5 --temperature 1.0 --max_tokens 1024
h bench run --name "llamacpp-llama3.1-8b-Q4_K_M-${NAME}"
# Aphrodite — fp8 quantization; model/URL are read back from
# Harbor config so bench always matches the engine settings.
h aphrodite model meta-llama/Meta-Llama-3.1-8B-Instruct
h aphrodite args --quantization fp8 --max-model-len 2048
h bench model "$(h aphrodite model)"
h bench api "$(h url -i aphrodite)"
h bench variants --temperature 0 --temperature 0.5 --temperature 1.0 --max_tokens 1024
h bench run --name "aphrodite-llama3.1-8B-fp8-${NAME}"
# Mistral.rs — in-situ quantization (ISQ) at several levels.

# Mistral.rs Q8_0
h mistralrs model meta-llama/Meta-Llama-3.1-8B-Instruct
h mistralrs arch llama
h mistralrs type plain
h mistralrs isq Q8_0
h mistralrs args --no-paged-attn
h bench model "$(h mistralrs model)"
h bench api "$(h url -i mistralrs)"
h bench variants --temperature 0 --temperature 0.5 --temperature 1.0 --max_tokens 1024
h bench run --name "mistralrs-llama3.1-8B-ISQ8_0-${NAME}"

# Mistral.rs Q6K
h mistralrs model meta-llama/Meta-Llama-3.1-8B-Instruct
h mistralrs arch llama
h mistralrs type plain
h mistralrs isq Q6K
h mistralrs args --no-paged-attn --truncate-sequence
h bench model "$(h mistralrs model)"
h bench api "$(h url -i mistralrs)"
h bench variants --temperature 0 --temperature 0.5 --temperature 1.0 --max_tokens 1024
h bench run --name "mistralrs-llama3.1-8B-ISQ6K-${NAME}"

# Mistral.rs Q4K
h mistralrs model meta-llama/Meta-Llama-3.1-8B-Instruct
h mistralrs arch llama
h mistralrs type plain
h mistralrs isq Q4K
h mistralrs args --no-paged-attn --truncate-sequence
h bench model "$(h mistralrs model)"
h bench api "$(h url -i mistralrs)"
h bench variants --temperature 0 --temperature 0.5 --temperature 1.0 --max_tokens 1024
h bench run --name "mistralrs-llama3.1-8B-ISQ4K-${NAME}"
# TabbyAPI — exl2 quants selected by pinning a specific HF
# snapshot hash via the model specifier.

# TabbyAPI 8bpw
h config set tabbyapi.model.specifier hub/models--turboderp--Llama-3.1-8B-Instruct-exl2/snapshots/77ba6c6d3e518d93aa4077433f52c8d73eddca44
h tabbyapi args --max-seq-len 2048
h bench model turboderp/Llama-3.1-8B-Instruct-exl2
h bench api "$(h url -i tabbyapi)"
h bench key "$(h config get tabbyapi.api.key)"
h bench variants --temperature 0 --temperature 0.5 --temperature 1.0 --max_tokens 1024
h bench run --name "tabbyapi-llama3.1-8B-8bpw-${NAME}"

# TabbyAPI 6bpw
h config set tabbyapi.model.specifier hub/models--turboderp--Llama-3.1-8B-Instruct-exl2/snapshots/ddb8769ce76c63fbd7e6baf4d0b2ffd8a1a221f9
h tabbyapi args --max-seq-len 2048
h bench model turboderp/Llama-3.1-8B-Instruct-exl2
h bench api "$(h url -i tabbyapi)"
h bench key "$(h config get tabbyapi.api.key)"
h bench variants --temperature 0 --temperature 0.5 --temperature 1.0 --max_tokens 1024
h bench run --name "tabbyapi-llama3.1-8B-6bpw-${NAME}"

# TabbyAPI 4bpw
h config set tabbyapi.model.specifier hub/models--turboderp--Llama-3.1-8B-Instruct-exl2/snapshots/cb83dde90c495cd21bd5cedd0eabf342aa602cbb
h tabbyapi args --max-seq-len 2048
h bench model turboderp/Llama-3.1-8B-Instruct-exl2
h bench api "$(h url -i tabbyapi)"
h bench key "$(h config get tabbyapi.api.key)"
h bench variants --temperature 0 --temperature 0.5 --temperature 1.0 --max_tokens 1024
h bench run --name "tabbyapi-llama3.1-8B-4bpw-${NAME}"
# vLLM — fp8 and bitsandbytes quantization of the same model.

# vLLM fp8
h vllm model meta-llama/Meta-Llama-3.1-8B-Instruct
h vllm args --quantization fp8 --max-model-len 2048
h bench model meta-llama/Meta-Llama-3.1-8B-Instruct
h bench api "$(h url -i vllm)"
h bench variants --temperature 0 --temperature 0.5 --temperature 1.0 --max_tokens 1024
h bench run --name "vllm-llama3.1-8B-fp8-${NAME}"

# vLLM bnb
h vllm model meta-llama/Meta-Llama-3.1-8B-Instruct
h vllm args --load-format bitsandbytes --quantization bitsandbytes --max-model-len 2048
h bench model meta-llama/Meta-Llama-3.1-8B-Instruct
h bench api "$(h url -i vllm)"
h bench variants --temperature 0 --temperature 0.5 --temperature 1.0 --max_tokens 1024
h bench run --name "vllm-llama3.1-8B-bnb-${NAME}"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.