Context: ggerganov/llama.cpp#2700
git clone https://github.com/ochafik/llama.cpp
git clone https://github.com/karpathy/llama2.c
cd llama.cpp
git checkout skip-unused-2
cmake -B ../build-skip . && ( cd ../build-skip && make -j )
cmake -B ../build-noskip . -DLLAMA_SKIP_UNUSED_LOGITS=0 && ( cd ../build-noskip && make -j )
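# Optional sanity check: both cmake builds should have produced a main binary
# (paths follow from the -B directories above)
ls -lh ../build-skip/bin/main ../build-noskip/bin/main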
make convert-llama2c-to-ggml quantize
# Get TinyLlamas models & convert to GGUF
for m in stories{110M,42M,15M} ; do
  test -f $m.bin || wget https://huggingface.co/karpathy/tinyllamas/resolve/main/$m.bin
  ./convert-llama2c-to-ggml \
    --copy-vocab-from-model ../llama2.c/tokenizer.model \
    --llama2c-model $m.bin --llama2c-output-model $m.gguf
done
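# Optional: the quantize tool built above isn't needed for this benchmark, but the
# converted GGUF files can be quantized if desired (q4_0 is one of the standard
# llama.cpp types; the output names here are just examples)
for m in stories{110M,42M,15M} ; do
  ./quantize $m.gguf $m-q4_0.gguf q4_0
done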
# Benchmark the skip vs. noskip builds with hyperfine
for m in stories{110M,42M,15M} ; do
  hyperfine --warmup 2 --runs 50 \
    -L build skip,noskip \
    -L model $m.gguf \
    "../build-{build}/bin/main -m {model} -f prompts/reason-act.txt -n 1 --temp 0 --ignore-eos -ngl 0"
done
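# Optional: persist one comparison for sharing by re-running with an export flag
# (--export-markdown is a standard hyperfine flag; the output file name is just an example)
hyperfine --warmup 2 --runs 50 -L build skip,noskip \
  --export-markdown skip-vs-noskip-110M.md \
  "../build-{build}/bin/main -m stories110M.gguf -f prompts/reason-act.txt -n 1 --temp 0 --ignore-eos -ngl 0"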
Results, skip build relative to noskip build (tested on M2 Max):
# stories110M.gguf: CPU = 1.15x faster, Metal = 1.03x faster
# stories42M.gguf: CPU = 1.20x faster, Metal = 1.06x faster
# stories15M.gguf: CPU = 1.25x faster, Metal = 1.09x faster