# setup NVIDIA GPU Docker for llama.cpp and run a perplexity test
# gist by @ianscrivener, created July 14, 2023
# NB: this runs inside an nvidia/cuda:11.x.x-devel-ubuntu22.04 container
# install some extra Ubuntu packages
apt update && apt install -y unzip libopenblas-dev nano git-lfs aria2 jq build-essential python3 python3-pip git
pip install --upgrade pip setuptools wheel
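# sanity check: confirm the CUDA toolkit and the GPU are visible
# (assumes the container was started with GPU access, e.g. --gpus all)
nvcc --version
nvidia-smi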
# clone llama.cpp repo
cd /workspace
git clone https://github.com/ggerganov/llama.cpp.git
# setup & build llama.cpp
cd /workspace/llama.cpp
pip install -r requirements.txt
make LLAMA_CUBLAS=1 -j
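# quick check that the CUDA-enabled binaries were built
ls -l ./main ./perplexity ./quantize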
## get Open Llama 3B v1
# mkdir -p /workspace/3b_open_llama_v1
# wget -O /workspace/3b_open_llama_v1/pytorch_model.bin https://huggingface.co/openlm-research/open_llama_3b/resolve/main/pytorch_model.bin
# wget -O /workspace/3b_open_llama_v1/tokenizer.model https://huggingface.co/openlm-research/open_llama_3b/resolve/main/tokenizer.model
# ls /workspace/3b_open_llama_v1
#
## convert model to ggml F16 format & delete original models
# cd /workspace/llama.cpp
# python3 convert.py /workspace/3b_open_llama_v1
# rm /workspace/3b_open_llama_v1/pytorch_model.bin
# rm /workspace/3b_open_llama_v1/tokenizer.model
# ls /workspace/3b_open_llama_v1
#
## quantize to q4_0
# cd /workspace/llama.cpp
# ./quantize /workspace/3b_open_llama_v1/ggml-model-f16.bin /workspace/3b_open_llama_v1/ggml-model-q4_0.bin q4_0
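# # q4_0 roughly quarters the f16 file size; confirm with:
# ls -lh /workspace/3b_open_llama_v1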
# get pre-quantized Open Llama 7B models (only q5_K is fetched below; uncomment others as needed)
mkdir -p /workspace/7b_open_llama_v1
cd /workspace/7b_open_llama_v1
# wget https://huggingface.co/SlyEcho/open_llama_7b_ggml/resolve/main/open-llama-7b-q2_K.bin
# wget https://huggingface.co/SlyEcho/open_llama_7b_ggml/resolve/main/open-llama-7b-q3_K_S.bin
# wget https://huggingface.co/SlyEcho/open_llama_7b_ggml/resolve/main/open-llama-7b-q3_K.bin
# wget https://huggingface.co/SlyEcho/open_llama_7b_ggml/resolve/main/open-llama-7b-q3_K_L.bin
# wget https://huggingface.co/SlyEcho/open_llama_7b_ggml/resolve/main/open-llama-7b-q4_0.bin
# wget https://huggingface.co/SlyEcho/open_llama_7b_ggml/resolve/main/open-llama-7b-q4_1.bin
# wget https://huggingface.co/SlyEcho/open_llama_7b_ggml/resolve/main/open-llama-7b-q4_K_S.bin
# wget https://huggingface.co/SlyEcho/open_llama_7b_ggml/resolve/main/open-llama-7b-q4_K.bin
# wget https://huggingface.co/SlyEcho/open_llama_7b_ggml/resolve/main/open-llama-7b-q5_0.bin
# wget https://huggingface.co/SlyEcho/open_llama_7b_ggml/resolve/main/open-llama-7b-q5_K_S.bin
# wget https://huggingface.co/SlyEcho/open_llama_7b_ggml/resolve/main/open-llama-7b-q5_1.bin
wget https://huggingface.co/SlyEcho/open_llama_7b_ggml/resolve/main/open-llama-7b-q5_K.bin
# wget https://huggingface.co/SlyEcho/open_llama_7b_ggml/resolve/main/open-llama-7b-q6_K.bin
# wget https://huggingface.co/SlyEcho/open_llama_7b_ggml/resolve/main/open-llama-7b-q8_0.bin
# wget https://huggingface.co/SlyEcho/open_llama_7b_ggml/resolve/main/open-llama-7b-f16.bin
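#
# optional: aria2 (installed above) can download these much faster via parallel
# connections, e.g. for the q5_K file used below:
# aria2c -x 8 -s 8 -o open-llama-7b-q5_K.bin https://huggingface.co/SlyEcho/open_llama_7b_ggml/resolve/main/open-llama-7b-q5_K.bin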
ls /workspace/7b_open_llama_v1
# get the WikiText-2 raw test corpus
cd /workspace
wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip
unzip wikitext-2-raw-v1.zip
rm wikitext-2-raw-v1.zip
cd /workspace/wikitext-2-raw
head -n406 wiki.test.raw > wiki.test.raw.406
head -n103 wiki.test.raw > wiki.test.raw.103
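# confirm the truncated corpora (expect 406 and 103 lines respectively)
wc -l wiki.test.raw.406 wiki.test.raw.103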
# run perplexity
cd /workspace/llama.cpp
export model="/workspace/7b_open_llama_v1/open-llama-7b-q5_K.bin"
export corpus="/workspace/wikitext-2-raw/wiki.test.raw.103"
export context=512
export batch=512
export threads=8
export gpu=24   # number of layers to offload to the GPU (-ngl)
./perplexity -m "$model" -f "$corpus" -c "$context" -b "$batch" -t "$threads" -ngl "$gpu"
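#
# optional sketch: sweep several quantizations in one go (assumes the matching
# files were downloaded above); each run's output is logged via tee
# for q in q4_0 q5_K q8_0; do
#   m="/workspace/7b_open_llama_v1/open-llama-7b-${q}.bin"
#   [ -f "$m" ] && ./perplexity -m "$m" -f "$corpus" -c "$context" -b "$batch" -t "$threads" -ngl "$gpu" 2>&1 | tee "ppl-${q}.log"
# done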