Created
April 14, 2026 14:16
-
-
Save marvs/f8535b4ff9d9b6351fbf8c690b9ff883 to your computer and use it in GitHub Desktop.
Simple Python script that wraps a local LLM with an OpenAI-compatible API
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import json | |
| import os | |
| import re | |
| import time | |
| import uuid | |
| from threading import Thread | |
| from typing import Any, Dict, List, Optional | |
| from fastapi import FastAPI, Header, HTTPException | |
| from fastapi.responses import JSONResponse, StreamingResponse | |
| from pydantic import BaseModel | |
| from transformers import AutoTokenizer, TextIteratorStreamer | |
| from optimum.intel import OVModelForCausalLM | |
| # ---------------------------- | |
| # Config | |
| # ---------------------------- | |
# Model name clients must send in the "model" field of a request.
MODEL_ID = "qwen25-coder-7b"
# Folder handed to the model loader; change if your folder name differs.
MODEL_DIR = "qwen25-coder-7b-ov"
# Identifier handed to the tokenizer loader.
BASE_MODEL_ID = "Qwen/Qwen2.5-Coder-7B-Instruct"
# Inference device; change to "CPU" if needed.
DEVICE = "GPU"
# Bearer token expected from clients; overridable through the environment.
API_KEY = os.environ.get("LOCAL_OPENAI_API_KEY", "local-dev-key")

# System prompt used when the client sends no system message and no tools.
FALLBACK_SYSTEM_PROMPT = " ".join(
    [
        "You are a concise and capable coding assistant.",
        "Prefer correct, practical answers.",
        "For code, keep explanations tight unless asked for detail.",
    ]
)

# System prompt used when the client sends no system message but does send tools.
# One entry per output line; the final entry is the example tool-call block.
TOOL_USE_SYSTEM_PROMPT = "\n".join(
    [
        "You are a coding assistant operating in agent mode. "
        "You have access to tools to read and write files. "
        "CRITICAL RULES:",
        "- You MUST use tools to perform actions. NEVER describe or simulate actions in text.",
        "- NEVER say 'I have created', 'I have saved', or 'I have updated' without calling a tool.",
        "- NEVER output file contents in a code block as a substitute for calling a tool.",
        "- When asked to create or edit a file, respond ONLY with the tool call JSON. No prose before or after.",
        "- After a tool result is returned to you, respond with a short plain text confirmation. Do NOT call the tool again.",
        "- A tool call must be a raw JSON object in this exact format:",
        '```json\n{"name": "tool_name", "arguments": {"param": "value"}}\n```',
    ]
)
| # ---------------------------- | |
| # Load model once at startup | |
| # ---------------------------- | |
# Both objects are created once at import time and shared by every request.
# The tokenizer is loaded from BASE_MODEL_ID, the model from MODEL_DIR on DEVICE.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID)
model = OVModelForCausalLM.from_pretrained(
    MODEL_DIR,
    device=DEVICE,
)

# ----------------------------
# FastAPI app
# ----------------------------
app = FastAPI(
    title="Local OpenAI-Compatible Server",
)
| # ---------------------------- | |
| # Request schemas | |
| # ---------------------------- | |
class ChatMessage(BaseModel):
    # One entry of the request's "messages" array.
    #
    # role:    speaker of the message; build_messages() keeps only
    #          "system", "user", "assistant" and "tool".
    # content: a plain string, a list of content parts, or None.
    #          It defaults to None so that a message which omits "content"
    #          (e.g. an assistant turn that only carried tool calls) is accepted
    #          instead of failing validation. NOTE(review): under pydantic v2 a
    #          bare `Any` field without a default is required - confirm against
    #          the installed pydantic version.
    role: str
    content: Any = None
class ChatCompletionRequest(BaseModel):
    # Request body accepted by POST /v1/chat/completions.

    # Must equal MODEL_ID; the route rejects anything else with HTTP 400.
    model: str
    # Conversation so far, oldest message first.
    messages: List[ChatMessage]
    # When true the route answers with server-sent events instead of one JSON body.
    stream: Optional[bool] = False
    # Sampling knobs accepted from the client.
    temperature: Optional[float] = None
    top_p: Optional[float] = None
    # Requested completion length; the route clamps it to the range 1..2048.
    max_tokens: Optional[int] = 512
    # Tool definitions and tool-choice hint supplied by the client.
    tools: Optional[List[Dict[str, Any]]] = None
    tool_choice: Optional[Any] = None
| # ---------------------------- | |
| # Helpers | |
| # ---------------------------- | |
def require_auth(authorization: Optional[str]) -> None:
    """Validate the ``Authorization`` header against the configured API key.

    Args:
        authorization: Raw header value, expected to look like
            ``"Bearer <token>"``; ``None`` when the header is absent.

    Raises:
        HTTPException: 401 when the header is missing, does not start with
            ``"Bearer "``, or carries a token different from ``API_KEY``.
    """
    # Local import: keeps this function self-contained without touching the
    # file-level import block.
    import hmac

    if not authorization or not authorization.startswith("Bearer "):
        raise HTTPException(status_code=401, detail="Missing bearer token")

    token = authorization.removeprefix("Bearer ").strip()

    # Constant-time comparison so response timing does not leak how much of the
    # key matched. Bytes are used because compare_digest rejects non-ASCII str.
    if not hmac.compare_digest(token.encode("utf-8"), API_KEY.encode("utf-8")):
        raise HTTPException(status_code=401, detail="Invalid API key")
def normalize_content(content: Any) -> str:
    """Flatten an OpenAI-style message ``content`` value into a single string.

    Args:
        content: ``None``, a string, a list of content parts, or any other value.

    Returns:
        ``""`` for ``None``; the string itself for ``str``; for a list, the
        non-empty parts joined by newlines; ``str(content)`` for anything else.

        List items are handled as follows:
        * dict with ``"type" == "text"`` -> its ``"text"`` value (skipped when
          missing or ``None``);
        * any other dict with a ``"content"`` key -> ``str(item["content"])``;
        * any other dict -> ignored;
        * non-dict item -> ``str(item)``.
    """
    if content is None:
        return ""
    if isinstance(content, str):
        return content
    if not isinstance(content, list):
        return str(content)

    parts: List[str] = []
    for item in content:
        if not isinstance(item, dict):
            parts.append(str(item))
        elif item.get("type") == "text":
            text = item.get("text")
            if text is not None:
                # Coerce to str: a non-string "text" value would otherwise make
                # the final join raise TypeError.
                parts.append(str(text))
        elif "content" in item:
            parts.append(str(item["content"]))

    return "\n".join(part for part in parts if part)
def build_messages(messages: List[ChatMessage], tools=None) -> List[Dict[str, str]]:
    """Convert request messages into plain role/content dicts for the chat template.

    System messages sent by the client are kept as-is (Continue Agent mode relies
    on them). A default system prompt is prepended only when the client sent
    none: the tool-use prompt if ``tools`` is truthy, the fallback prompt otherwise.

    Messages with an unrecognised role, or whose normalised content is empty or
    whitespace-only, are dropped.
    """
    accepted_roles = ("system", "user", "assistant", "tool")  # "tool" allowed through

    prepared: List[Dict[str, str]] = []
    for message in messages:
        if message.role not in accepted_roles:
            continue
        text = normalize_content(message.content).strip()
        if text:
            prepared.append({"role": message.role, "content": text})

    has_system = False
    for entry in prepared:
        if entry["role"] == "system":
            has_system = True
            break

    if not has_system:
        if tools:
            default_prompt = TOOL_USE_SYSTEM_PROMPT
        else:
            default_prompt = FALLBACK_SYSTEM_PROMPT
        prepared.insert(0, {"role": "system", "content": default_prompt})

    return prepared
def maybe_inject_tool_reminder(messages: List[Dict[str, str]], tools) -> List[Dict[str, str]]:
    """Append a "use the tool" nudge when the latest user message asks for an action.

    The input list is returned unchanged (same object) when no tools are
    available, when the conversation ends with a tool result (the model should
    wrap up rather than call again), when there is no user message, or when the
    latest user message contains none of the action keywords. Otherwise a new
    list is returned with one extra user message at the end; the input list is
    not mutated.
    """
    if not tools or not messages:
        return messages

    # A trailing tool result means the model should summarise, not call again.
    if messages[-1]["role"] == "tool":
        return messages

    # Find the most recent user turn.
    for candidate in reversed(messages):
        if candidate["role"] == "user":
            latest_user_text = candidate["content"].lower()
            break
    else:
        return messages

    # Plain substring match on the lower-cased text.
    wants_action = False
    for keyword in ("create", "save", "write", "edit", "update", "make", "add"):
        if keyword in latest_user_text:
            wants_action = True
            break
    if not wants_action:
        return messages

    reminder = {
        "role": "user",
        "content": (
            "Remember: do not describe the action in text. "
            "Call the appropriate tool directly using a ```json tool call."
        ),
    }
    return [*messages, reminder]
def make_prompt(messages: List[Dict[str, str]], tools: Optional[List[Dict[str, Any]]] = None) -> str:
    """Render the conversation into a prompt string via the tokenizer's chat template.

    The template is asked for text (not token ids) with the generation prompt
    appended. ``tools`` is forwarded to the template only when it is truthy, so
    the template can describe the available tools to the model.
    """
    template_options: Dict[str, Any] = {
        "tokenize": False,
        "add_generation_prompt": True,
    }
    if tools:
        template_options["tools"] = tools
    return tokenizer.apply_chat_template(messages, **template_options)
# Where generated text should be cut:
#   * the special end tokens, wherever they appear;
#   * a "User:" / "Assistant:" role marker, but only at the start of a line, so
#     ordinary output such as "class User:" is not truncated.
_STOP_MARKER_RE = re.compile(
    r"<\|im_end\|>|<\|endoftext\|>|^(?:User|Assistant):",
    re.MULTILINE,
)


def clean_response(text: str) -> str:
    """Trim generated text at the first stop marker and strip whitespace.

    Args:
        text: Raw decoded model output; ``None`` is tolerated.

    Returns:
        The text before the earliest stop marker (``<|im_end|>``,
        ``<|endoftext|>``, or a line beginning with ``User:`` / ``Assistant:``),
        with surrounding whitespace removed. ``""`` when ``text`` is ``None``.
    """
    if text is None:
        return ""
    text = str(text)

    marker = _STOP_MARKER_RE.search(text)
    if marker is not None:
        text = text[:marker.start()]
    return text.strip()
# Qwen native format: <tool_call>{...}</tool_call>
_NATIVE_TOOL_CALL_RE = re.compile(r"<tool_call>(.*?)</tool_call>", re.DOTALL)
# Fallback format: a ```json fenced block holding one JSON object.
_FENCED_TOOL_CALL_RE = re.compile(r"```json\s*(\{.*?\})\s*```", re.DOTALL)


def _to_openai_tool_call(raw: str):
    """Turn one raw JSON snippet into an OpenAI-style tool call dict, or ``None``.

    ``None`` is returned when the snippet is not valid JSON, is not a JSON
    object, or has no non-empty string ``"name"``. The last check keeps ordinary
    JSON examples in an answer from being mistaken for tool calls.
    """
    try:
        parsed = json.loads(raw.strip())
    except json.JSONDecodeError:
        # Best effort: malformed candidates are skipped, not fatal.
        return None

    if not isinstance(parsed, dict):
        return None

    name = parsed.get("name")
    if not isinstance(name, str) or not name.strip():
        return None

    # Support both {"name": ..., "arguments": ...}
    # and {"name": ..., "parameters": ...} (some models vary).
    arguments = parsed.get("arguments") or parsed.get("parameters") or {}

    # Some models emit the arguments already JSON-encoded as a string; decode
    # them first so they are not encoded a second time below.
    if isinstance(arguments, str):
        try:
            decoded = json.loads(arguments)
        except json.JSONDecodeError:
            decoded = None
        if isinstance(decoded, dict):
            arguments = decoded

    return {
        "id": f"call_{uuid.uuid4().hex[:8]}",
        "type": "function",
        "function": {
            "name": name,
            "arguments": json.dumps(arguments),
        },
    }


def parse_tool_calls(text: str):
    """Detect tool calls in model output.

    Two formats are recognised, tried in this order:
      1. Qwen native: <tool_call>{"name": ..., "arguments": ...}</tool_call>
      2. Fallback:    a ```json fenced block containing the same object

    The first format that yields at least one valid call wins.

    Args:
        text: Cleaned model output.

    Returns:
        ``(tool_calls, None)`` when at least one valid tool call was found,
        where ``tool_calls`` is a list of OpenAI-style dicts with ``id``,
        ``type`` and ``function`` (``name`` plus JSON-encoded ``arguments``);
        otherwise ``(None, text)`` with the text unchanged.
    """
    for pattern in (_NATIVE_TOOL_CALL_RE, _FENCED_TOOL_CALL_RE):
        tool_calls = []
        for candidate in pattern.findall(text):
            call = _to_openai_tool_call(candidate)
            if call is not None:
                tool_calls.append(call)
        if tool_calls:
            return tool_calls, None

    return None, text
def build_generation_kwargs(inputs, max_new_tokens: int, streamer=None):
    """Assemble the keyword arguments passed to ``model.generate``.

    The tokenized ``inputs`` are expanded into the result, followed by the fixed
    decoding settings: no sampling, a mild repetition penalty, and the
    tokenizer's EOS id used for both end-of-sequence and padding. ``streamer``
    is included only when one is supplied.
    """
    eos_id = tokenizer.eos_token_id
    generation_kwargs = {
        **inputs,
        "max_new_tokens": max_new_tokens,
        "do_sample": False,
        "repetition_penalty": 1.05,
        "eos_token_id": eos_id,
        "pad_token_id": eos_id,
    }
    if streamer is None:
        return generation_kwargs
    generation_kwargs["streamer"] = streamer
    return generation_kwargs
def log_messages(messages: List[Dict[str, str]]) -> None:
    """Print the prepared conversation to stdout for debugging.

    Each message is shown as its upper-cased role, up to the first 4000
    characters of its content, and a separator line.
    """
    print("----- INCOMING MESSAGES -----")
    for entry in messages:
        role_label = entry["role"].upper()
        print(f"{role_label}:")
        preview = entry["content"][:4000]  # cap very long messages in the log
        print(preview)
        print("-----")
| # ---------------------------- | |
| # Routes | |
| # ---------------------------- | |
@app.get("/health")
def health():
    """Unauthenticated liveness probe reporting the served model and device."""
    status = {
        "ok": True,
        "model": MODEL_ID,
        "device": DEVICE,
    }
    return status
@app.get("/v1/models")
def list_models(authorization: Optional[str] = Header(None)):
    """Return the model list, which contains only the single local model.

    Raises:
        HTTPException: 401 from ``require_auth`` when the bearer token is
            missing or wrong.
    """
    require_auth(authorization)

    model_entry = {
        "id": MODEL_ID,
        "object": "model",
        "owned_by": "local",
    }
    return {"object": "list", "data": [model_entry]}
@app.post("/v1/chat/completions")
def chat_completions(
    body: ChatCompletionRequest,
    authorization: Optional[str] = Header(None)
):
    """Handle an OpenAI-style chat completion request, streaming or not.

    Flow: authenticate, validate the model name, build the prompt from the
    messages (and tools), generate, then look for tool calls in the output.

    * ``stream=True``: the whole completion is generated and buffered first so
      tool calls can be detected. It is then sent as server-sent events: either
      a single chunk carrying the tool calls, or an opening role chunk followed
      by the text re-emitted word by word. Both end with ``data: [DONE]``.
    * otherwise: one ``chat.completion`` JSON body with token usage.

    ``max_tokens`` is clamped to 1..2048 (512 when unset). ``temperature`` and
    ``top_p`` are accepted in the request but this handler does not forward them
    to generation.

    Raises:
        HTTPException: 401 from ``require_auth``; 400 when ``body.model`` is not
            ``MODEL_ID``.
    """
    require_auth(authorization)
    if body.model != MODEL_ID:
        raise HTTPException(
            status_code=400,
            detail=f"Unknown model '{body.model}'. Expected '{MODEL_ID}'."
        )

    messages = build_messages(body.messages, tools=body.tools)
    messages = maybe_inject_tool_reminder(messages, tools=body.tools)
    log_messages(messages)
    prompt = make_prompt(messages, tools=body.tools)

    print("----- TOOLS RECEIVED -----")
    print(json.dumps(body.tools, indent=2) if body.tools else "NO TOOLS")
    print("----- PROMPT SENT TO MODEL -----")
    print(prompt[:3000])
    print("---------------------------------")

    inputs = tokenizer(prompt, return_tensors="pt")
    max_new_tokens = min(max(body.max_tokens or 512, 1), 2048)
    completion_id = f"chatcmpl-{uuid.uuid4().hex}"
    created = int(time.time())

    # Streaming mode
    if body.stream:
        streamer = TextIteratorStreamer(
            tokenizer,
            skip_prompt=True,
            skip_special_tokens=True
        )
        generation_kwargs = build_generation_kwargs(
            inputs=inputs,
            max_new_tokens=max_new_tokens,
            streamer=streamer,
        )

        def sse_chunk(delta, finish_reason=None) -> str:
            """Format one ``chat.completion.chunk`` as an SSE ``data:`` line."""
            payload = {
                "id": completion_id,
                "object": "chat.completion.chunk",
                "created": created,
                "model": MODEL_ID,
                "choices": [
                    {
                        "index": 0,
                        "delta": delta,
                        "finish_reason": finish_reason
                    }
                ]
            }
            return f"data: {json.dumps(payload, ensure_ascii=False)}\n\n"

        def event_stream():
            generation_errors = []

            def run_generation():
                # If generate() raises, nothing would ever signal end-of-stream
                # and the loop below would block forever. Record the error and
                # close the streamer explicitly.
                # NOTE(review): relies on TextIteratorStreamer.end() pushing the
                # stop signal - confirm against the installed transformers.
                try:
                    model.generate(**generation_kwargs)
                except Exception as exc:  # boundary of the worker thread
                    generation_errors.append(exc)
                    streamer.end()

            thread = Thread(target=run_generation)
            thread.start()

            # Buffer the full response first so we can detect tool calls
            pieces = []
            try:
                for chunk in streamer:
                    if chunk:
                        pieces.append(chunk)
            finally:
                thread.join()

            if generation_errors:
                print("----- GENERATION FAILED (stream) -----")
                print(repr(generation_errors[0]))
                print("--------------------------------------")
                error_payload = {
                    "error": {
                        "message": "Generation failed; see server log.",
                        "type": "server_error"
                    }
                }
                yield f"data: {json.dumps(error_payload, ensure_ascii=False)}\n\n"
                yield "data: [DONE]\n\n"
                return

            full_response = clean_response("".join(pieces))
            print("----- RAW MODEL OUTPUT (stream) -----")
            print(full_response[:2000])
            print("-------------------------------------")

            tool_calls, plain_content = parse_tool_calls(full_response)

            # Tool call response - emit as a single SSE chunk
            if tool_calls:
                delta = {
                    "role": "assistant",
                    "content": None,
                    # Streaming clients match tool-call fragments by "index",
                    # so every entry carries one.
                    "tool_calls": [
                        {
                            "index": position,
                            "id": tc["id"],
                            "type": tc["type"],
                            "function": {
                                "name": tc["function"]["name"],
                                "arguments": tc["function"]["arguments"]
                            }
                        }
                        for position, tc in enumerate(tool_calls)
                    ]
                }
                yield sse_chunk(delta, finish_reason="tool_calls")
                yield "data: [DONE]\n\n"
                return

            # Plain text - announce the assistant role first, then re-emit word
            # by word so Continue still sees streaming
            yield sse_chunk({"role": "assistant", "content": ""})
            for word in re.split(r'(\s+)', plain_content):
                if not word:
                    continue
                yield sse_chunk({"content": word})

            yield sse_chunk({}, finish_reason="stop")
            yield "data: [DONE]\n\n"

        return StreamingResponse(event_stream(), media_type="text/event-stream")

    # Non-streaming mode
    generation_kwargs = build_generation_kwargs(
        inputs=inputs,
        max_new_tokens=max_new_tokens,
        streamer=None,
    )
    start = time.time()
    output = model.generate(**generation_kwargs)
    end = time.time()

    # The output holds prompt + completion; keep only the newly generated ids.
    input_len = inputs["input_ids"].shape[1]
    generated_ids = output[0][input_len:]
    response_text = clean_response(
        tokenizer.decode(generated_ids, skip_special_tokens=True)
    )
    print("----- RAW MODEL OUTPUT (non-stream) -----")
    print(response_text[:2000])
    print("-----------------------------------------")

    tool_calls, plain_content = parse_tool_calls(response_text)
    if tool_calls:
        message = {
            "role": "assistant",
            "content": None,
            "tool_calls": tool_calls
        }
        finish_reason = "tool_calls"
    else:
        message = {
            "role": "assistant",
            "content": plain_content
        }
        finish_reason = "stop"

    usage = {
        "prompt_tokens": int(input_len),
        "completion_tokens": int(len(generated_ids)),
        "total_tokens": int(input_len + len(generated_ids)),
    }
    print(
        f"Model={MODEL_ID} Device={DEVICE} "
        f"Time={end - start:.2f}s "
        f"PromptTokens={usage['prompt_tokens']} "
        f"CompletionTokens={usage['completion_tokens']}"
    )

    return JSONResponse(
        {
            "id": completion_id,
            "object": "chat.completion",
            "created": created,
            "model": MODEL_ID,
            "choices": [
                {
                    "index": 0,
                    "message": message,
                    "finish_reason": finish_reason
                }
            ],
            "usage": usage
        }
    )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment