A NATS worker that subscribes to the subject inference.requests, runs each incoming message through a llama.cpp model, and publishes the model's response to the request's reply subject.
# Start a local NATS server first, e.g. by running: nats-server
# Download the model (wget's -O flag sets the output filename):
# wget https://huggingface.co/remyxai/stablelm-zephyr-3B_localmentor/resolve/main/ggml-model-q4_0.gguf -O stablelm-localmentor.gguf
import asyncio

import nats
from llama_cpp import Llama


async def llm_runner(nats_url, model_path, subject):
    # Connect to NATS and load the GGUF model once, up front.
    nc = await nats.connect(nats_url)
    llm = Llama(model_path)

    async def inference_handler(msg):
        # Run the prompt through the model and send the completion
        # back on the request's reply subject.
        data = msg.data.decode()
        response = llm(data, max_tokens=2048, stop=["###", "\n\n"], echo=True)
        r = response["choices"][0]["text"]
        await nc.publish(msg.reply, str(r).encode())

    # Serve inference requests on the subject indefinitely.
    await nc.subscribe(subject, cb=inference_handler)
    await asyncio.Future()


if __name__ == "__main__":
    asyncio.run(
        llm_runner(
            "nats://localhost:4222", "stablelm-localmentor.gguf", "inference.requests"
        )
    )
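
For reference, a minimal client sketch (an assumption, not part of the original gist): it publishes a prompt to inference.requests via NATS request-reply and waits for the worker's response. The 30-second timeout and the placeholder prompt are arbitrary choices; tune both to your model and hardware.

# client.py - hypothetical companion client; assumes the worker above is
# running and a nats-server is listening on localhost:4222.
import asyncio

import nats


async def main():
    nc = await nats.connect("nats://localhost:4222")
    # request() publishes to the subject and awaits the reply the worker
    # sends on msg.reply; the generous timeout allows for slow CPU inference.
    reply = await nc.request(
        "inference.requests", b"What is a good podcast on startups?", timeout=30
    )
    print(reply.data.decode())
    await nc.drain()


if __name__ == "__main__":
    asyncio.run(main())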