llama.cpp shell utils
#!/bin/zsh
# Collection of variables, aliases and functions to work w/ llama.cpp
# Source to activate.
# HARDCODED VALUES - MAKE SURE TO TUNE THEM FOR YOUR SYSTEM!
export ROCM_VERSION="6.0.2"
export USE_ROCM=1
export HIP_PLATFORM="amd"
export GPU_ARCHS="gfx1100"
export HSA_OVERRIDE_GFX_VERSION="11.0.0"
export ROCM_PATH="/opt/rocm"
export TF_PYTHON_VERSION="3.12"
export USE_SYMENGINE=1
# llama.cpp-related variables (tweak if necessary)
export LLAMA_CPP_PATH="${HOME}/llama.cpp"
export LLAMA_CPP_SERVER_ADDRESS="localhost"
export LLAMA_CPP_SERVER_PORT="51536"
export LLAMA_CPP_SERVER_URL="http://${LLAMA_CPP_SERVER_ADDRESS}:${LLAMA_CPP_SERVER_PORT}/"
export LLAMA_CPP_SERVER_CTX_SIZE=10240
export LLAMA_CPP_SERVER_BATCH_SIZE=4096
export LLAMA_CPP_SERVER_UBATCH_SIZE=2048
export LLAMA_CPP_SERVER_GPU_LAYERS=999
# On systems with an iGPU, try building with -DGGML_HIP_UMA=ON to see how it behaves
export LLAMA_CPP_CMAKE_ARGS="-DGGML_NATIVE=ON -DGGML_HIPBLAS=ON -DGGML_CCACHE=OFF -DGGML_AVX=ON -DGGML_AVX2=ON -DGGML_OPENMP=ON -DAMDGPU_TARGETS=${GPU_ARCHS} -DGGML_CUDA_DMMV_X=128 -DGGML_CUDA_MMV_Y=4 -DLLAMA_CURL=ON -DLLAMA_STANDALONE=ON"
export LLAMA_CPP_DEFAULT_PROMPT="You are an AI assistant that follows instruction extremely well. Help as much as you can.\n"
export PATH="${PATH}:${LLAMA_CPP_PATH}/build/bin"
export PYTHONPATH="${PYTHONPATH}:${LLAMA_CPP_PATH}/gguf-py"
# System-related variables
export NUMBER_OF_CORES=$(grep -c ^processor /proc/cpuinfo)
# ROCm-related variables
export GFX_ARCH=$GPU_ARCHS
export PYTORCH_ROCM_ARCH=$GPU_ARCHS
export TF_ROCM_AMDGPU_TARGETS=$GPU_ARCHS
export ROCM_INSTALL_DIR=$ROCM_PATH
export ROCM_TOOLKIT_PATH=$ROCM_PATH
export HIP_PATH=$ROCM_PATH
export HIPCXX="${ROCM_PATH}/llvm/bin/clang"
export PATH="${PATH}:${HIP_PATH}/bin"
export CPATH="${CPATH}:${ROCM_PATH}/include"
export C_INCLUDE_PATH="${C_INCLUDE_PATH}:${ROCM_PATH}/include"
export CPLUS_INCLUDE_PATH="${CPLUS_INCLUDE_PATH}:${ROCM_PATH}/include"
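# Optional sanity check after sourcing this file (assumes the ROCm install paths above
# match your system; adjust ROCM_PATH if not):
#   "${HIPCXX}" --version
#   "${ROCM_PATH}/bin/hipconfig" --full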
# generic llm-related functions
function llm-server() {
echo "Running $1 using llama-server @ ${LLAMA_CPP_SERVER_URL} w/ ${NUMBER_OF_CORES} CPU cores, ${LLAMA_CPP_SERVER_GPU_LAYERS} GPU layers, and ${LLAMA_CPP_SERVER_CTX_SIZE} context size"
llama-server --color --log-enable --batch-size ${LLAMA_CPP_SERVER_BATCH_SIZE} --ubatch-size ${LLAMA_CPP_SERVER_UBATCH_SIZE} --threads ${NUMBER_OF_CORES} --mlock --gpu-layers ${LLAMA_CPP_SERVER_GPU_LAYERS} --ctx-size ${LLAMA_CPP_SERVER_CTX_SIZE} --port ${LLAMA_CPP_SERVER_PORT} --flash-attn --model $1
}
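# Example usage (hypothetical model path - point it at any GGUF file you actually have):
#   llm-server ~/models/Meta-Llama-3-8B-Instruct.Q8_0.gguf
# Once running, the server's HTTP API and built-in web UI are reachable at ${LLAMA_CPP_SERVER_URL}.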
function llm-cli() {
    local llama_prompt=${2:-${LLAMA_CPP_DEFAULT_PROMPT}}
    echo "Running $1 using llama-cli w/ ${NUMBER_OF_CORES} CPU cores, ${LLAMA_CPP_SERVER_GPU_LAYERS} GPU layers, and ${LLAMA_CPP_SERVER_CTX_SIZE} context size"
    llama-cli --color --log-enable --batch-size ${LLAMA_CPP_SERVER_BATCH_SIZE} --ubatch-size ${LLAMA_CPP_SERVER_UBATCH_SIZE} --threads ${NUMBER_OF_CORES} --mlock --gpu-layers ${LLAMA_CPP_SERVER_GPU_LAYERS} --ctx-size ${LLAMA_CPP_SERVER_CTX_SIZE} --flash-attn --interactive-first --prompt "$llama_prompt" --model "$1"
}
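# Example usage (hypothetical model path and prompt; the prompt falls back to
# LLAMA_CPP_DEFAULT_PROMPT when omitted):
#   llm-cli ~/models/Meta-Llama-3-8B-Instruct.Q8_0.gguf "You are a concise coding assistant.\n"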
function llm-quantize-model() {
    local model_path=$1
    local quantization=${2:-auto}
    local output_path=${3:-.}
    # Extract the model name from the model_path
    local model_name=$(basename "$model_path")
    # Check if the model_path exists
    if [ ! -d "$model_path" ]; then
        echo "Error: Model directory '$model_path' does not exist."
        return 1
    fi
    # Run the conversion command
    python "$LLAMA_CPP_PATH/convert-hf-to-gguf.py" --outtype "$quantization" --outfile "$output_path/$model_name.$quantization.gguf" "$model_path"
    # Check if the conversion was successful
    if [ $? -eq 0 ]; then
        echo "Model '$model_path' successfully quantized to $quantization format and saved as $output_path/$model_name.$quantization.gguf"
    else
        echo "Error: Failed to quantize model '$model_path'."
    fi
}
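# Example usage (hypothetical Hugging Face checkout; output types accepted by
# convert-hf-to-gguf.py include f32, f16, q8_0 and auto):
#   llm-quantize-model ~/models/Meta-Llama-3-8B-Instruct f16 ~/models/gguf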
function llm-requantize-model() {
    local model_name=$1
    local output_quantization=$2
    local input_quantization=${3:-auto}
    local num_threads=$(/usr/bin/nproc)
    if [[ -f "${model_name}.${input_quantization}.gguf" ]]; then
        llama-quantize "${model_name}.${input_quantization}.gguf" "${model_name}.${output_quantization}.gguf" "$output_quantization" "$num_threads"
        echo "Model $model_name requantized successfully from $input_quantization to $output_quantization."
    else
        echo "Error: File ${model_name}.${input_quantization}.gguf not found."
        return 1
    fi
}
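# Example usage (hypothetical model prefix; expects <model_name>.<input_quantization>.gguf
# in the current directory, matching the naming produced by llm-quantize-model):
#   llm-requantize-model Meta-Llama-3-8B-Instruct Q4_K_M f16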
# llama.cpp management functions
function llm-llama-clone() {
echo "Pulling llama.cpp repository to ${LLAMA_CPP_PATH}"
git clone git@github.com:ggerganov/llama.cpp.git $LLAMA_CPP_PATH
local og_pwd=$(pwd)
cd $LLAMA_CPP_PATH
git submodule update --init --recursive
git lfs pull
cd $og_pwd
}
function llm-llama-update() {
echo "Pulling latest llama.cpp commit..."
local og_pwd=$(pwd)
cd $LLAMA_CPP_PATH
git pull
cd $og_pwd
}
function llm-llama-clean() {
echo "Clearing llama.cpp repository from any build artifacts and junk..."
local og_pwd=$(pwd)
cd $LLAMA_CPP_PATH
git clean -xddf
cd $og_pwd
}
function llm-llama-clean-update() {
    llm-llama-clean
    llm-llama-update
}
function llm-llama-build() {
    local og_pwd=$(pwd)
    cd "$LLAMA_CPP_PATH"
    echo "Generating build files (CMake arguments: ${LLAMA_CPP_CMAKE_ARGS})"
    cmake -S . -B build -G Ninja -DCMAKE_BUILD_TYPE=Release ${(z)LLAMA_CPP_CMAKE_ARGS}
    echo "Building llama.cpp..."
    cmake --build build --config Release
    cd "$og_pwd"
}
function llm-llama-clean-build() {
    llm-llama-clean
    llm-llama-build
}
function llm-llama-clean-update-build() {
    llm-llama-clean
    llm-llama-update
    llm-llama-build
}
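# Typical workflows using the functions above (model path is hypothetical):
#   First-time setup:  llm-llama-clone && llm-llama-build
#   Update + rebuild:  llm-llama-clean-update-build
#   Serve a model:     llm-server ~/models/Meta-Llama-3-8B-Instruct.Q8_0.gguf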