Skip to content

Instantly share code, notes, and snippets.

@marvs
Created April 14, 2026 14:16
Show Gist options
  • Select an option

  • Save marvs/f8535b4ff9d9b6351fbf8c690b9ff883 to your computer and use it in GitHub Desktop.

Select an option

Save marvs/f8535b4ff9d9b6351fbf8c690b9ff883 to your computer and use it in GitHub Desktop.
Simple Python script that wraps a local LLM with an OpenAI-compatible API
import json
import os
import re
import time
import uuid
from threading import Thread
from typing import Any, Dict, List, Optional
from fastapi import FastAPI, Header, HTTPException
from fastapi.responses import JSONResponse, StreamingResponse
from pydantic import BaseModel
from transformers import AutoTokenizer, TextIteratorStreamer
from optimum.intel import OVModelForCausalLM
# ----------------------------
# Config
# ----------------------------
# Model name advertised by /v1/models and required in incoming requests.
MODEL_ID = "qwen25-coder-7b"
# Directory the model weights are loaded from; change if your folder name differs.
MODEL_DIR = "qwen25-coder-7b-ov"
# Identifier the tokenizer is loaded from.
BASE_MODEL_ID = "Qwen/Qwen2.5-Coder-7B-Instruct"
# Device passed to the model loader; change to "CPU" if needed.
DEVICE = "GPU"
# Bearer token clients must present; overridable through the environment.
API_KEY = os.getenv("LOCAL_OPENAI_API_KEY", "local-dev-key")

# System prompt used when the client sent no system message and no tools.
FALLBACK_SYSTEM_PROMPT = " ".join(
    (
        "You are a concise and capable coding assistant.",
        "Prefer correct, practical answers.",
        "For code, keep explanations tight unless asked for detail.",
    )
)

# System prompt used when the client sent no system message but did send tools.
TOOL_USE_SYSTEM_PROMPT = (
    "You are a coding assistant operating in agent mode. "
    "You have access to tools to read and write files. "
    + "\n".join(
        (
            "CRITICAL RULES:",
            "- You MUST use tools to perform actions. NEVER describe or simulate actions in text.",
            "- NEVER say 'I have created', 'I have saved', or 'I have updated' without calling a tool.",
            "- NEVER output file contents in a code block as a substitute for calling a tool.",
            "- When asked to create or edit a file, respond ONLY with the tool call JSON. No prose before or after.",
            "- After a tool result is returned to you, respond with a short plain text confirmation. Do NOT call the tool again.",
            "- A tool call must be a raw JSON object in this exact format:",
            "```json",
            '{"name": "tool_name", "arguments": {"param": "value"}}',
            "```",
        )
    )
)
# ----------------------------
# Load model once at startup
# ----------------------------
# Both objects are created at import time and shared by every request handler.
# The tokenizer is loaded from BASE_MODEL_ID while the weights come from the
# local MODEL_DIR folder and are placed on DEVICE.
# NOTE(review): presumably MODEL_DIR holds an OpenVINO export of BASE_MODEL_ID,
# so the two must stay in sync - confirm when changing either.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID)
model = OVModelForCausalLM.from_pretrained(MODEL_DIR, device=DEVICE)
# ----------------------------
# FastAPI app
# ----------------------------
# Application object the route decorators below register their handlers on.
app = FastAPI(title="Local OpenAI-Compatible Server")
# ----------------------------
# Request schemas
# ----------------------------
class ChatMessage(BaseModel):
    """A single entry of the ``messages`` array in a chat-completions request."""

    # Sender role. build_messages() forwards only "system", "user",
    # "assistant" and "tool"; messages with any other role are dropped there.
    role: str
    # Message body. normalize_content() accepts None, a plain string, or a
    # list of content parts, and stringifies anything else.
    # NOTE(review): no default is declared, so whether a request that omits
    # "content" validates depends on the installed pydantic version - confirm.
    content: Any
class ChatCompletionRequest(BaseModel):
    """Request body accepted by ``POST /v1/chat/completions``."""

    # Must equal MODEL_ID; the handler rejects any other value with HTTP 400.
    model: str
    # Conversation history, oldest message first.
    messages: List[ChatMessage]
    # When truthy the handler answers with a server-sent-events stream.
    stream: Optional[bool] = False
    # Accepted for client compatibility; the visible handler does not read
    # these two fields (generation is configured with do_sample=False).
    temperature: Optional[float] = None
    top_p: Optional[float] = None
    # Requested completion length; the handler clamps it to the range 1..2048.
    max_tokens: Optional[int] = 512
    # Tool definitions, passed to the chat template and used to choose the
    # fallback system prompt.
    tools: Optional[List[Dict[str, Any]]] = None
    # Accepted for client compatibility; not read by the visible handler.
    tool_choice: Optional[Any] = None
# ----------------------------
# Helpers
# ----------------------------
def require_auth(authorization: Optional[str]) -> None:
    """Validate the ``Authorization`` header against the configured API key.

    Args:
        authorization: Raw header value, expected as ``"Bearer <token>"``;
            ``None`` when the client sent no header.

    Raises:
        HTTPException: 401 with detail ``"Missing bearer token"`` when the
            header is absent or does not start with ``"Bearer "``, and 401
            with detail ``"Invalid API key"`` when the token does not match
            ``API_KEY``.
    """
    # Imported locally so this function does not depend on the file's
    # top-level import block.
    import hmac

    if not authorization or not authorization.startswith("Bearer "):
        raise HTTPException(status_code=401, detail="Missing bearer token")

    token = authorization.removeprefix("Bearer ").strip()

    # Constant-time comparison so response timing does not reveal how many
    # leading characters of a guessed key are correct. Both sides are encoded
    # to bytes because compare_digest() rejects non-ASCII str arguments.
    if not hmac.compare_digest(token.encode("utf-8"), API_KEY.encode("utf-8")):
        raise HTTPException(status_code=401, detail="Invalid API key")
def normalize_content(content: Any) -> str:
    """Flatten an OpenAI-style message ``content`` value into plain text.

    Args:
        content: ``None``, a string, a list of content parts, or any other
            value.

    Returns:
        ``""`` for ``None``; the string itself for a string; for a list, the
        non-empty text of each part joined with newlines; ``str(content)``
        for anything else.

    List items are handled as follows:
      * dict with ``"type": "text"`` contributes its ``"text"`` value;
      * any other dict that has a ``"content"`` key contributes that value;
      * dicts matching neither rule (e.g. image parts) contribute nothing;
      * non-dict items contribute ``str(item)``.
    """
    if content is None:
        return ""
    if isinstance(content, str):
        return content
    if not isinstance(content, list):
        return str(content)

    parts: List[str] = []
    for item in content:
        if not isinstance(item, dict):
            parts.append(str(item))
            continue
        if item.get("type") == "text":
            text = item.get("text")
            # Client input is untyped: coerce non-string text so the join
            # below cannot fail with TypeError, and skip an explicit null.
            if text is not None:
                parts.append(text if isinstance(text, str) else str(text))
        elif "content" in item:
            parts.append(str(item["content"]))

    return "\n".join(part for part in parts if part)
def build_messages(messages: List[ChatMessage], tools=None) -> List[Dict[str, str]]:
    """
    Convert request messages into the role/content dicts fed to the chat template.

    Messages whose role is not system/user/assistant/tool, or whose normalized
    content is empty or whitespace-only, are dropped. Client system messages
    are kept as-is; only when none survives is a default system prompt put at
    the front (the tool-use prompt if tools were supplied, otherwise the
    fallback prompt).
    """
    accepted_roles = ("system", "user", "assistant", "tool")  # "tool" allowed through
    prepared: List[Dict[str, str]] = []
    saw_system = False

    for message in messages:
        if message.role not in accepted_roles:
            continue
        text = normalize_content(message.content)
        if not text or not text.strip():
            continue
        prepared.append({"role": message.role, "content": text.strip()})
        if message.role == "system":
            saw_system = True

    if saw_system:
        return prepared

    if tools:
        default_prompt = TOOL_USE_SYSTEM_PROMPT
    else:
        default_prompt = FALLBACK_SYSTEM_PROMPT
    return [{"role": "system", "content": default_prompt}, *prepared]
def maybe_inject_tool_reminder(messages: List[Dict[str, str]], tools) -> List[Dict[str, str]]:
    """
    Append a "use the tool" nudge when the latest user message looks like a file action.

    The input list is returned untouched when no tools were supplied, when the
    conversation ends with a tool result (the model should wrap up rather than
    call again), when there is no user message, or when the latest user
    message contains none of the action keywords. Otherwise a new list with
    one extra user message is returned; the caller's list is not mutated.
    """
    if not tools or not messages:
        return messages

    # A trailing tool result means the model should summarize, not call again.
    if messages[-1]["role"] == "tool":
        return messages

    for candidate in reversed(messages):
        if candidate["role"] == "user":
            latest_request = candidate["content"].lower()
            break
    else:
        return messages

    triggers = ("create", "save", "write", "edit", "update", "make", "add")
    if not any(trigger in latest_request for trigger in triggers):
        return messages

    reminder = {
        "role": "user",
        "content": (
            "Remember: do not describe the action in text. "
            "Call the appropriate tool directly using a ```json tool call."
        ),
    }
    return [*messages, reminder]
def make_prompt(messages: List[Dict[str, str]], tools: Optional[List[Dict[str, Any]]] = None) -> str:
    """
    Render the conversation through the tokenizer's chat template.

    The template is asked for text (not token ids) ending with the generation
    prompt. Tool definitions are forwarded only when some were supplied, so
    Qwen2.5-Instruct knows how to emit tool calls.
    """
    tool_options = {"tools": tools} if tools else {}
    return tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        **tool_options,
    )
def clean_response(text: str) -> str:
    """
    Trim raw model output: cut at the first stop marker and strip whitespace.

    ``None`` becomes ``""``; any other value is converted with ``str()``. The
    markers are applied in order, each one discarding everything from its
    first occurrence onward.
    """
    if text is None:
        return ""

    cleaned = str(text)
    for stop_marker in ("<|im_end|>", "<|endoftext|>", "User:", "Assistant:"):
        # partition() leaves the text unchanged when the marker is absent.
        cleaned, _, _ = cleaned.partition(stop_marker)
    return cleaned.strip()
def parse_tool_calls(text: str):
    """
    Extract tool calls from model output.

    Two formats are recognized, the second only when the first finds nothing:
      1. Qwen native: <tool_call>{"name": ..., "arguments": ...}</tool_call>
      2. Fallback: ```json\\n{"name": ..., "arguments": ...}\\n```

    A candidate is accepted only if it parses as a JSON object with a
    non-empty string "name". Its arguments are taken from "arguments" or,
    failing that, "parameters" (some models vary) and re-serialized to a JSON
    string. Candidates that are invalid JSON, not objects, or nameless are
    skipped, so an ordinary ```json snippet in an answer is not mistaken for
    a tool call.

    Returns:
        ``(tool_calls, None)`` when at least one call was accepted, where each
        entry is ``{"id", "type": "function", "function": {"name",
        "arguments"}}``; otherwise ``(None, text)`` with the text unchanged.
    """
    # Format 1: native Qwen tags
    candidates = re.findall(r"<tool_call>(.*?)</tool_call>", text, re.DOTALL)
    # Format 2: ```json code block fallback
    if not candidates:
        candidates = re.findall(r"```json\s*(\{.*?\})\s*```", text, re.DOTALL)

    tool_calls = []
    for raw in candidates:
        try:
            parsed = json.loads(raw.strip())
        except json.JSONDecodeError:
            # Best effort: malformed candidates are ignored, not fatal.
            continue

        # Valid JSON need not be an object (e.g. a list inside the tags).
        if not isinstance(parsed, dict):
            continue

        name = parsed.get("name")
        if not isinstance(name, str) or not name:
            continue

        arguments = parsed.get("arguments") or parsed.get("parameters") or {}
        tool_calls.append({
            "id": f"call_{uuid.uuid4().hex[:8]}",
            "type": "function",
            "function": {
                "name": name,
                "arguments": json.dumps(arguments),
            },
        })

    if not tool_calls:
        return None, text
    return tool_calls, None
def build_generation_kwargs(inputs, max_new_tokens: int, streamer=None):
    """
    Assemble the keyword arguments passed to ``model.generate``.

    The tokenized ``inputs`` are merged with fixed decoding settings: sampling
    disabled, a 1.05 repetition penalty, and the tokenizer's EOS id used both
    as end-of-sequence and as padding id. A ``streamer`` entry is added only
    when one is supplied.
    """
    eos_id = tokenizer.eos_token_id
    decoding_settings = {
        "max_new_tokens": max_new_tokens,
        "do_sample": False,
        "repetition_penalty": 1.05,
        "eos_token_id": eos_id,
        "pad_token_id": eos_id,
    }
    # dict(**a, **b) still raises TypeError if `inputs` repeats one of the keys.
    generation_kwargs = dict(**inputs, **decoding_settings)
    if streamer is not None:
        generation_kwargs["streamer"] = streamer
    return generation_kwargs
def log_messages(messages: List[Dict[str, str]]) -> None:
    """
    Print the prepared conversation to stdout for debugging.

    A header line is printed first; each message then prints its upper-cased
    role, up to the first 4000 characters of its content, and a separator.
    """
    print("----- INCOMING MESSAGES -----")
    for entry in messages:
        print(entry["role"].upper(), ":", sep="")
        preview = entry["content"][:4000]
        print(preview)
        print("-----")
# ----------------------------
# Routes
# ----------------------------
@app.get("/health")
def health():
    """Unauthenticated status endpoint reporting the configured model and device."""
    status = {"ok": True}
    status["model"] = MODEL_ID
    status["device"] = DEVICE
    return status
@app.get("/v1/models")
def list_models(authorization: Optional[str] = Header(None)):
    """
    Return the model list in OpenAI format; it always holds exactly MODEL_ID.

    Raises HTTPException(401) through require_auth() when the bearer token is
    missing or wrong.
    """
    require_auth(authorization)
    served_model = {
        "id": MODEL_ID,
        "object": "model",
        "owned_by": "local",
    }
    return {"object": "list", "data": [served_model]}
def _completion_chunk(
    completion_id: str,
    created: int,
    delta: Dict[str, Any],
    finish_reason: Optional[str],
) -> Dict[str, Any]:
    """Build one single-choice ``chat.completion.chunk`` payload."""
    return {
        "id": completion_id,
        "object": "chat.completion.chunk",
        "created": created,
        "model": MODEL_ID,
        "choices": [
            {
                "index": 0,
                "delta": delta,
                "finish_reason": finish_reason,
            }
        ],
    }


def _sse_event(payload: Dict[str, Any]) -> str:
    """Serialize a payload as one server-sent-events ``data:`` frame."""
    return f"data: {json.dumps(payload, ensure_ascii=False)}\n\n"


@app.post("/v1/chat/completions")
def chat_completions(
    body: ChatCompletionRequest,
    authorization: Optional[str] = Header(None)
):
    """
    OpenAI-compatible chat completion endpoint.

    The request messages are normalized, optionally extended with a tool-use
    reminder, rendered through the chat template and passed to the model.
    The completion length is ``body.max_tokens`` clamped to 1..2048.

    Streaming (``body.stream`` truthy): the whole completion is generated and
    buffered first so tool calls can be detected. It is then sent as SSE:
    either one chunk carrying every tool call (finish_reason "tool_calls"),
    or the text re-emitted piece by piece followed by a "stop" chunk. Each
    stream ends with ``data: [DONE]``. If generation fails, a single
    ``{"error": ...}`` event is sent instead of leaving the stream hanging.

    Non-streaming: returns one ``chat.completion`` JSON object with either
    ``tool_calls`` or plain ``content``, plus token usage counts.

    Raises:
        HTTPException: 401 from require_auth() for a missing or invalid
            bearer token; 400 when ``body.model`` is not ``MODEL_ID``.
    """
    require_auth(authorization)
    if body.model != MODEL_ID:
        raise HTTPException(
            status_code=400,
            detail=f"Unknown model '{body.model}'. Expected '{MODEL_ID}'."
        )

    messages = build_messages(body.messages, tools=body.tools)
    messages = maybe_inject_tool_reminder(messages, tools=body.tools)
    log_messages(messages)
    prompt = make_prompt(messages, tools=body.tools)

    print("----- TOOLS RECEIVED -----")
    print(json.dumps(body.tools, indent=2) if body.tools else "NO TOOLS")
    print("----- PROMPT SENT TO MODEL -----")
    print(prompt[:3000])
    print("---------------------------------")

    inputs = tokenizer(prompt, return_tensors="pt")
    max_new_tokens = min(max(body.max_tokens or 512, 1), 2048)
    completion_id = f"chatcmpl-{uuid.uuid4().hex}"
    created = int(time.time())

    # Streaming mode
    if body.stream:
        streamer = TextIteratorStreamer(
            tokenizer,
            skip_prompt=True,
            skip_special_tokens=True
        )
        generation_kwargs = build_generation_kwargs(
            inputs=inputs,
            max_new_tokens=max_new_tokens,
            streamer=streamer,
        )

        def event_stream():
            generation_errors = []

            def run_generation():
                # If generate() raises, the streamer never receives its end
                # signal and the consumer loop below would block forever.
                # Record the error and close the streamer explicitly.
                try:
                    model.generate(**generation_kwargs)
                except Exception as exc:  # thread boundary: reported below
                    generation_errors.append(exc)
                    streamer.end()

            thread = Thread(target=run_generation)
            thread.start()

            # Buffer the full response first so we can detect tool calls
            pieces = []
            try:
                for chunk in streamer:
                    if chunk:
                        pieces.append(chunk)
            finally:
                thread.join()

            if generation_errors:
                failure = generation_errors[0]
                print(f"Generation failed: {failure!r}")
                yield _sse_event({
                    "error": {
                        "message": f"Generation failed: {failure}",
                        "type": "server_error",
                    }
                })
                yield "data: [DONE]\n\n"
                return

            full_response = clean_response("".join(pieces))
            print("----- RAW MODEL OUTPUT (stream) -----")
            print(full_response[:2000])
            print("-------------------------------------")

            tool_calls, plain_content = parse_tool_calls(full_response)

            # Tool call response — emit as a single SSE chunk
            if tool_calls:
                delta = {
                    "role": "assistant",
                    "content": None,
                    # Streamed tool-call deltas are keyed by "index"; clients
                    # use it to assemble each call, so every entry carries it.
                    "tool_calls": [
                        {"index": position, **call}
                        for position, call in enumerate(tool_calls)
                    ],
                }
                yield _sse_event(
                    _completion_chunk(completion_id, created, delta, "tool_calls")
                )
                yield "data: [DONE]\n\n"
                return

            # Plain text — re-emit word by word so Continue still sees streaming.
            # The capturing group keeps the whitespace runs as their own pieces.
            for word in re.split(r'(\s+)', plain_content):
                if not word:
                    continue
                yield _sse_event(
                    _completion_chunk(completion_id, created, {"content": word}, None)
                )

            yield _sse_event(_completion_chunk(completion_id, created, {}, "stop"))
            yield "data: [DONE]\n\n"

        return StreamingResponse(event_stream(), media_type="text/event-stream")

    # Non-streaming mode
    generation_kwargs = build_generation_kwargs(
        inputs=inputs,
        max_new_tokens=max_new_tokens,
        streamer=None,
    )
    start = time.time()
    output = model.generate(**generation_kwargs)
    end = time.time()

    # generate() returns prompt + completion; keep only the new tokens.
    input_len = inputs["input_ids"].shape[1]
    generated_ids = output[0][input_len:]
    response_text = clean_response(
        tokenizer.decode(generated_ids, skip_special_tokens=True)
    )
    print("----- RAW MODEL OUTPUT (non-stream) -----")
    print(response_text[:2000])
    print("-----------------------------------------")

    tool_calls, plain_content = parse_tool_calls(response_text)
    if tool_calls:
        message = {
            "role": "assistant",
            "content": None,
            "tool_calls": tool_calls
        }
        finish_reason = "tool_calls"
    else:
        message = {
            "role": "assistant",
            "content": plain_content
        }
        finish_reason = "stop"

    usage = {
        "prompt_tokens": int(input_len),
        "completion_tokens": int(len(generated_ids)),
        "total_tokens": int(input_len + len(generated_ids)),
    }
    print(
        f"Model={MODEL_ID} Device={DEVICE} "
        f"Time={end - start:.2f}s "
        f"PromptTokens={usage['prompt_tokens']} "
        f"CompletionTokens={usage['completion_tokens']}"
    )
    return JSONResponse(
        {
            "id": completion_id,
            "object": "chat.completion",
            "created": created,
            "model": MODEL_ID,
            "choices": [
                {
                    "index": 0,
                    "message": message,
                    "finish_reason": finish_reason
                }
            ],
            "usage": usage
        }
    )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment