Context: ggerganov/llama.cpp#2700
git clone https://github.com/ochafik/llama.cpp
git clone https://github.com/karpathy/llama2.c
cd llama.cpp
git checkout skip-unused-2
cmake -B ../build-skip . && ( cd ../build-skip && make -j )
cmake -B ../build-noskip . -DLLAMA_SKIP_UNUSED_LOGITS=0 && ( cd ../build-noskip && make -j )
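# Optional sanity check: both cmake builds should have produced a main binary
# (paths follow from the -B directories above)
ls -lh ../build-skip/bin/main ../build-noskip/bin/main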
make convert-llama2c-to-ggml quantize
# Get TinyLlamas models & convert to GGUF
for m in stories{110M,42M,15M} ; do
  test -f $m.bin || wget https://huggingface.co/karpathy/tinyllamas/resolve/main/$m.bin
  ./convert-llama2c-to-ggml \
    --copy-vocab-from-model ../llama2.c/tokenizer.model \
    --llama2c-model $m.bin --llama2c-output-model $m.gguf
done
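# Optional: the quantize tool built above isn't needed for this benchmark, but the
# converted GGUF files can be quantized if desired (q4_0 is one of the standard
# llama.cpp types; the output names here are just examples)
for m in stories{110M,42M,15M} ; do
  ./quantize $m.gguf $m-q4_0.gguf q4_0
done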
# Benchmark the skip vs. noskip builds with hyperfine
for m in stories{110M,42M,15M} ; do
  hyperfine --warmup 2 --runs 50 \
    -L build skip,noskip \
    -L model $m.gguf \
    "../build-{build}/bin/main -m {model} -f prompts/reason-act.txt -n 1 --temp 0 --ignore-eos -ngl 0"
done
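# Optional: persist one comparison for sharing by re-running with an export flag
# (--export-markdown is a standard hyperfine flag; the output file name is just an example)
hyperfine --warmup 2 --runs 50 -L build skip,noskip \
  --export-markdown skip-vs-noskip-110M.md \
  "../build-{build}/bin/main -m stories110M.gguf -f prompts/reason-act.txt -n 1 --temp 0 --ignore-eos -ngl 0"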
Results, skip build relative to noskip build (tested on M2 Max):
# stories110M.gguf: CPU = 1.15x faster, Metal = 1.03x faster
# stories42M.gguf: CPU = 1.20x faster, Metal = 1.06x faster
# stories15M.gguf: CPU = 1.25x faster, Metal = 1.09x faster