
@ftnext
Last active April 29, 2024 07:48
  • Save ftnext/88fa5b804d1c3c801ed523d4392de4d2 to your computer and use it in GitHub Desktop.
# /// script
# dependencies = ["llama-cpp-python"]
# ///
import os

from llama_cpp import Llama

# Load the GGUF weights that Ollama has already downloaded
# (stored as a content-addressed blob under ~/.ollama/models/blobs/).
llm = Llama(
    model_path=os.path.expanduser(
        "~/.ollama/models/blobs/sha256-4fed7364ee3e0c7cb4fe0880148bfdfcd1b630981efa0802a6b62ee52e7da97e"
    ),
    n_ctx=1024,      # context window size in tokens
    n_threads=8,     # CPU threads used for inference
    n_gpu_layers=0,  # 0 = run entirely on CPU
)

prompt = "How to explain Internet to a medieval knight?"
# Wrap the user message in the Phi-3-style chat template and let the
# model complete the assistant turn.
output = llm(
    f"<|user|>\n{prompt}<|end|>\n<|assistant|>",
    max_tokens=256,    # cap on generated tokens
    stop=["<|end|>"],  # stop at the end-of-turn token
    echo=True,         # include the prompt in the returned text
    temperature=0.0,   # deterministic (greedy) decoding
)
print(output["choices"][0]["text"])
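The inline f-string builds a single-turn prompt in the Phi-3-style chat format. A small helper (hypothetical, not part of the gist) makes the template explicit and reusable:

```python
def build_phi3_prompt(user_message: str) -> str:
    """Wrap a single user turn in the Phi-3-style chat template.

    The model is expected to generate the assistant turn and emit
    <|end|> when finished, which is why the completion call above
    passes stop=["<|end|>"].
    """
    return f"<|user|>\n{user_message}<|end|>\n<|assistant|>"


print(build_phi3_prompt("How to explain Internet to a medieval knight?"))
```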
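The `# /// script` header at the top is PEP 723 inline script metadata, so a runner that understands it can install `llama-cpp-python` into a throwaway environment automatically. Assuming `uv` is installed and the referenced Ollama blob exists locally, the script could be run like this (the filename is hypothetical):

```shell
# uv reads the inline metadata, provisions an ephemeral virtualenv
# containing llama-cpp-python, and executes the script in it.
uv run llm_example.py
```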