llama.cpp shell utils
#!/bin/zsh
# Collection of variables, aliases and functions to work w/ llama.cpp
# Source to activate.
# HARDCODED VALUES - MAKE SURE TO TUNE THEM FOR YOUR SYSTEM!
export ROCM_VERSION="6.0.2"
export USE_ROCM=1
export HIP_PLATFORM="amd"
export GPU_ARCHS="gfx1100"
export HSA_OVERRIDE_GFX_VERSION="11.0.0"
export ROCM_PATH="/opt/rocm"
export TF_PYTHON_VERSION="3.12"
export USE_SYMENGINE=1
# llama.cpp-related variables (tweak if necessary)
export LLAMA_CPP_PATH="${HOME}/llama.cpp"
export LLAMA_CPP_SERVER_ADDRESS="localhost"
export LLAMA_CPP_SERVER_PORT="51536"
export LLAMA_CPP_SERVER_URL="http://${LLAMA_CPP_SERVER_ADDRESS}:${LLAMA_CPP_SERVER_PORT}/"
export LLAMA_CPP_SERVER_CTX_SIZE=10240
export LLAMA_CPP_SERVER_BATCH_SIZE=4096
export LLAMA_CPP_SERVER_UBATCH_SIZE=2048
export LLAMA_CPP_SERVER_GPU_LAYERS=999
# On systems with an iGPU, try building with -DGGML_HIP_UMA=ON to see how it behaves
export LLAMA_CPP_CMAKE_ARGS="-DGGML_NATIVE=ON -DGGML_HIPBLAS=ON -DGGML_CCACHE=OFF -DGGML_AVX=ON -DGGML_AVX2=ON -DGGML_OPENMP=ON -DAMDGPU_TARGETS=${GPU_ARCHS} -DGGML_CUDA_DMMV_X=128 -DGGML_CUDA_MMV_Y=4 -DLLAMA_CURL=ON -DLLAMA_STANDALONE=ON"
export LLAMA_CPP_DEFAULT_PROMPT="You are an AI assistant that follows instruction extremely well. Help as much as you can.\n"
export PATH="${PATH}:${LLAMA_CPP_PATH}/build/bin"
export PYTHONPATH="${PYTHONPATH}:${LLAMA_CPP_PATH}/gguf-py"
# System-related variables
export NUMBER_OF_CORES=$(grep -c ^processor /proc/cpuinfo)
# ROCm-related variables
export GFX_ARCH=$GPU_ARCHS
export PYTORCH_ROCM_ARCH=$GPU_ARCHS
export TF_ROCM_AMDGPU_TARGETS=$GPU_ARCHS
export ROCM_INSTALL_DIR=$ROCM_PATH
export ROCM_TOOLKIT_PATH=$ROCM_PATH
export HIP_PATH=$ROCM_PATH
export HIPCXX="${ROCM_PATH}/llvm/bin/clang"
export PATH="${PATH}:${HIP_PATH}/bin"
export CPATH="${CPATH}:${ROCM_PATH}/include"
export C_INCLUDE_PATH="${C_INCLUDE_PATH}:${ROCM_PATH}/include"
export CPLUS_INCLUDE_PATH="${CPLUS_INCLUDE_PATH}:${ROCM_PATH}/include"
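# Optional sanity check after sourcing this file (assumes the ROCm install paths above
# match your system; adjust ROCM_PATH if not):
#   "${HIPCXX}" --version
#   "${ROCM_PATH}/bin/hipconfig" --full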
# generic llm-related functions
function llm-server() {
echo "Running $1 using llama-server @ ${LLAMA_CPP_SERVER_URL} w/ ${NUMBER_OF_CORES} CPU cores, ${LLAMA_CPP_SERVER_GPU_LAYERS} GPU layers, and ${LLAMA_CPP_SERVER_CTX_SIZE} context size"
llama-server --color --log-enable --batch-size ${LLAMA_CPP_SERVER_BATCH_SIZE} --ubatch-size ${LLAMA_CPP_SERVER_UBATCH_SIZE} --threads ${NUMBER_OF_CORES} --mlock --gpu-layers ${LLAMA_CPP_SERVER_GPU_LAYERS} --ctx-size ${LLAMA_CPP_SERVER_CTX_SIZE} --port ${LLAMA_CPP_SERVER_PORT} --flash-attn --model $1
}
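# Example usage (hypothetical model path - point it at any GGUF file you actually have):
#   llm-server ~/models/Meta-Llama-3-8B-Instruct.Q8_0.gguf
# Once running, the server's HTTP API and built-in web UI are reachable at ${LLAMA_CPP_SERVER_URL}.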
function llm-cli() {
    local llama_prompt=${2:-${LLAMA_CPP_DEFAULT_PROMPT}}
    echo "Running $1 using llama-cli w/ ${NUMBER_OF_CORES} CPU cores, ${LLAMA_CPP_SERVER_GPU_LAYERS} GPU layers, and ${LLAMA_CPP_SERVER_CTX_SIZE} context size"
    llama-cli --color --log-enable --batch-size ${LLAMA_CPP_SERVER_BATCH_SIZE} --ubatch-size ${LLAMA_CPP_SERVER_UBATCH_SIZE} --threads ${NUMBER_OF_CORES} --mlock --gpu-layers ${LLAMA_CPP_SERVER_GPU_LAYERS} --ctx-size ${LLAMA_CPP_SERVER_CTX_SIZE} --flash-attn --interactive-first --prompt "$llama_prompt" --model "$1"
}
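# Example usage (hypothetical model path and prompt; the prompt falls back to
# LLAMA_CPP_DEFAULT_PROMPT when omitted):
#   llm-cli ~/models/Meta-Llama-3-8B-Instruct.Q8_0.gguf "You are a concise coding assistant.\n"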
function llm-quantize-model() {
    local model_path=$1
    local quantization=${2:-auto}
    local output_path=${3:-.}
    # Extract the model name from the model_path
    local model_name=$(basename "$model_path")
    # Check if the model_path exists
    if [ ! -d "$model_path" ]; then
        echo "Error: Model directory '$model_path' does not exist."
        return 1
    fi
    # Run the conversion command
    python "$LLAMA_CPP_PATH/convert-hf-to-gguf.py" --outtype "$quantization" --outfile "$output_path/$model_name.$quantization.gguf" "$model_path"
    # Check if the conversion was successful
    if [ $? -eq 0 ]; then
        echo "Model '$model_path' successfully quantized to $quantization format and saved as $output_path/$model_name.$quantization.gguf"
    else
        echo "Error: Failed to quantize model '$model_path'."
    fi
}
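# Example usage (hypothetical Hugging Face checkout; output types accepted by
# convert-hf-to-gguf.py include f32, f16, q8_0 and auto):
#   llm-quantize-model ~/models/Meta-Llama-3-8B-Instruct f16 ~/models/gguf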
function llm-requantize-model() {
    local model_name=$1
    local output_quantization=$2
    local input_quantization=${3:-auto}
    local num_threads=$(/usr/bin/nproc)
    if [[ -f "${model_name}.${input_quantization}.gguf" ]]; then
        llama-quantize "${model_name}.${input_quantization}.gguf" "${model_name}.${output_quantization}.gguf" "$output_quantization" "$num_threads"
        echo "Model $model_name requantized successfully from $input_quantization to $output_quantization."
    else
        echo "Error: File ${model_name}.${input_quantization}.gguf not found."
        return 1
    fi
}
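# Example usage (hypothetical model prefix; expects <model_name>.<input_quantization>.gguf
# in the current directory, matching the naming produced by llm-quantize-model):
#   llm-requantize-model Meta-Llama-3-8B-Instruct Q4_K_M f16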
# llama.cpp management functions
function llm-llama-clone() {
echo "Pulling llama.cpp repository to ${LLAMA_CPP_PATH}"
git clone git@github.com:ggerganov/llama.cpp.git $LLAMA_CPP_PATH
local og_pwd=$(pwd)
cd $LLAMA_CPP_PATH
git submodule update --init --recursive
git lfs pull
cd $og_pwd
}
function llm-llama-update() {
echo "Pulling latest llama.cpp commit..."
local og_pwd=$(pwd)
cd $LLAMA_CPP_PATH
git pull
cd $og_pwd
}
function llm-llama-clean() {
echo "Clearing llama.cpp repository from any build artifacts and junk..."
local og_pwd=$(pwd)
cd $LLAMA_CPP_PATH
git clean -xddf
cd $og_pwd
}
function llm-llama-clean-update() {
    llm-llama-clean
    llm-llama-update
}
function llm-llama-build() {
    local og_pwd=$(pwd)
    cd "$LLAMA_CPP_PATH"
    echo "Generating build files (CMake arguments: ${LLAMA_CPP_CMAKE_ARGS})"
    cmake -S . -B build -G Ninja -DCMAKE_BUILD_TYPE=Release ${(z)LLAMA_CPP_CMAKE_ARGS}
    echo "Building llama.cpp..."
    cmake --build build --config Release
    cd "$og_pwd"
}
function llm-llama-clean-build() {
    llm-llama-clean
    llm-llama-build
}
function llm-llama-clean-update-build() {
    llm-llama-clean
    llm-llama-update
    llm-llama-build
}
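# Typical workflows using the functions above (model path is hypothetical):
#   First-time setup:  llm-llama-clone && llm-llama-build
#   Update + rebuild:  llm-llama-clean-update-build
#   Serve a model:     llm-server ~/models/Meta-Llama-3-8B-Instruct.Q8_0.gguf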