Adding gitignore and modularizing the proxy file
- .gitignore +2 -0
- proxy.py +342 -258
.gitignore
ADDED
@@ -0,0 +1,2 @@
+.venv
+.idea
proxy.py
CHANGED
@@ -1,5 +1,6 @@
 """
 OpenAI-compatible FastAPI proxy that wraps a smolagents CodeAgent
+Refactored for readability and modularity (single-file).
 """
 
 import os  # For dealing with env vars
@@ -9,48 +10,107 @@ import asyncio  # For async operations
 import typing  # For type annotations
 import logging  # For logging
 
+
 import fastapi
 import fastapi.responses
 
-# Upstream pass-through
-from agent_server.agent_streaming import
-
-
-
+# Upstream pass-through + local helpers
+from agent_server.agent_streaming import (
+    run_agent_stream,
+    _proxy_upstream_chat_completions,
+)
+from agent_server.formatting_reasoning import (
+    _format_reasoning_chunk,
+    _extract_final_text,
+    _maybe_parse_final_from_stdout,
+)
+from agent_server.helpers import (
+    normalize_content_to_text,
+    _messages_to_task,
+    _openai_response,
+    _sse_headers,
+)
 from agent_server.openai_schemas import ChatMessage, ChatCompletionRequest
 from agent_server.sanitizing_think_tags import scrub_think_tags
+
+# Local agent factories
 from agents.code_writing_agents import (
     generate_code_writing_agent_without_tools,
     generate_code_writing_agent_with_search,
 )
-
 from agents.json_tool_calling_agents import (
     generate_tool_calling_agent_with_search_and_code,
 )
-
 from agents.generator_and_critic import generate_generator_with_managed_critic
 
-#
+# --------------------------------------------------------------------------------------
+# Logging / Config
+# --------------------------------------------------------------------------------------
 logging.basicConfig(level=os.getenv("LOG_LEVEL", "INFO").upper())
 log = logging.getLogger(__name__)
 
 AGENT_MODEL = os.getenv("AGENT_MODEL", "Qwen/Qwen3-1.7B")
 
-#
+# --------------------------------------------------------------------------------------
+# FastAPI app
+# --------------------------------------------------------------------------------------
 app = fastapi.FastAPI()
 
 
-
-
-
+# --------------------------------------------------------------------------------------
+# Utility helpers (pure functions)
+# --------------------------------------------------------------------------------------
+def _now_ts() -> int:
+    return int(time.time())
 
-# ---------- Agent streaming bridge (truly live) ----------
 
+def _normalize_model_name(raw_model: typing.Union[str, dict, None]) -> str:
+    """
+    Accepts either a bare model string or {"id": "..."} form; default to the
+    local code-writing agent if unspecified.
+    """
+    if isinstance(raw_model, dict):
+        return typing.cast(str, raw_model.get("id", "code-writing-agent-without-tools"))
+    if isinstance(raw_model, str) and raw_model.strip():
+        return raw_model
+    return "code-writing-agent-without-tools"
 
-
-
-
-
+
+def _is_upstream_passthrough(model_name: str) -> bool:
+    return model_name == AGENT_MODEL
+
+
+def _is_upstream_passthrough_nothink(model_name: str) -> bool:
+    return model_name == f"{AGENT_MODEL}-nothink"
+
+
+def _apply_nothink_to_body(
+    body: ChatCompletionRequest, messages: typing.List[ChatMessage]
+) -> ChatCompletionRequest:
+    """
+    Mutates message content to request 'no-think' behavior upstream.
+    - Sets body["model"] to AGENT_MODEL (strip -nothink)
+    - Appends '/nothink' to user message content
+    """
+    new_body: ChatCompletionRequest = dict(body)  # shallow copy is fine
+    new_body["model"] = AGENT_MODEL
+
+    new_messages: typing.List[ChatMessage] = []
+    for msg in messages:
+        if msg.get("role") == "user":
+            content = normalize_content_to_text(msg.get("content", ""))
+            new_messages.append({"role": "user", "content": content + "\n/nothink"})
+        else:
+            new_messages.append(msg)
+    new_body["messages"] = new_messages
+    return new_body
+
+
+def _models_payload() -> dict:
+    """
+    Returns the /v1/models response payload.
+    """
+    now = _now_ts()
     return {
         "object": "list",
         "data": [
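
For reference, a minimal standalone sketch of the "-nothink" rewrite that _apply_nothink_to_body performs. The names below are stand-ins: the real normalize_content_to_text lives in agent_server.helpers and is not shown in this diff, and the AGENT_MODEL value simply mirrors the os.getenv fallback above.

# Hypothetical, self-contained sketch of the "-nothink" rewrite.
AGENT_MODEL = "Qwen/Qwen3-1.7B"  # assumed default, mirrors the env fallback

def normalize_content_to_text(content) -> str:
    # Stand-in: the real helper also flattens OpenAI list-style content parts.
    return content if isinstance(content, str) else str(content)

def apply_nothink(body: dict) -> dict:
    new_body = dict(body)            # shallow copy, as in the diff
    new_body["model"] = AGENT_MODEL  # strip the "-nothink" suffix
    new_body["messages"] = [
        {"role": "user",
         "content": normalize_content_to_text(m.get("content", "")) + "\n/nothink"}
        if m.get("role") == "user" else m
        for m in body.get("messages", [])
    ]
    return new_body

body = {"model": f"{AGENT_MODEL}-nothink",
        "messages": [{"role": "user", "content": "What is 2+2?"}]}
print(apply_nothink(body)["messages"][0]["content"])  # What is 2+2?\n/nothink
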
@@ -85,7 +145,7 @@ async def list_models():
             "owned_by": "upstream",
         },
         {
-            "id": AGENT_MODEL
+            "id": f"{AGENT_MODEL}-nothink",
             "object": "model",
             "created": now,
             "owned_by": "upstream",
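
After this hunk, /v1/models advertises the upstream model, its "-nothink" variant, and the local agent ids. A quick check, assuming the proxy is reachable on http://localhost:8000 (host and port are outside this diff):

# Sketch: list the advertised model ids.
import httpx

models = httpx.get("http://localhost:8000/v1/models").json()
print([m["id"] for m in models["data"]])
# Expected to include AGENT_MODEL, AGENT_MODEL + "-nothink", and the four
# local agent ids handled by _agent_for_model below.
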
@@ -94,8 +154,234 @@ async def list_models():
     }
 
 
+def _agent_for_model(model_name: str):
+    """
+    Returns an instantiated agent for the given local model id.
+    Raises ValueError on unknown local ids.
+    """
+    if model_name == "code-writing-agent-without-tools":
+        return generate_code_writing_agent_without_tools()
+    if model_name == "code-writing-agent-with-search":
+        return generate_code_writing_agent_with_search()
+    if model_name == "tool-calling-agent-with-search-and-code":
+        return generate_tool_calling_agent_with_search_and_code()
+    if model_name == "generator-with-managed-critic":
+        return generate_generator_with_managed_critic()
+    raise ValueError(f"Unknown model id: {model_name}")
+
+
+def _openai_stream_base(model_name: str) -> dict:
+    """
+    The base chunk used for all SSE deltas in streaming mode.
+    """
+    return {
+        "id": f"chatcmpl-smol-{_now_ts()}",
+        "object": "chat.completion.chunk",
+        "created": _now_ts(),
+        "model": model_name,
+        "choices": [
+            {
+                "index": 0,
+                "delta": {"role": "assistant"},
+                "finish_reason": None,
+            }
+        ],
+    }
+
+
+def _safe_extract_candidate(val: typing.Any) -> typing.Optional[str]:
+    """
+    Extracts a candidate final text string if present and non-empty.
+    """
+    cand = _extract_final_text(val)
+    if cand and cand.strip().lower() != "none":
+        return cand
+    return None
+
+
+def _truncate_reasoning_blob(reasoning: str, limit: int = 24000) -> str:
+    if len(reasoning) > limit:
+        return reasoning[:limit] + "\n… [truncated]"
+    return reasoning
+
+
+# --------------------------------------------------------------------------------------
+# Streaming + non-streaming execution
+# --------------------------------------------------------------------------------------
+def _make_sse_generator(
+    task: str,
+    agent_for_request: typing.Any,
+    model_name: str,
+):
+    """
+    Returns an async generator that yields SSE 'data:' lines for FastAPI StreamingResponse.
+    """
+
+    async def _gen():
+        base = _openai_stream_base(model_name)
+
+        # initial role header
+        yield f"data: {json.dumps(base)}\n\n"
+
+        reasoning_idx = 0
+        final_candidate: typing.Optional[str] = None
+
+        async for item in run_agent_stream(task, agent_for_request):
+            # Short-circuit on explicit error signaled by the runner
+            if isinstance(item, dict) and "__error__" in item:
+                error_chunk = {
+                    **base,
+                    "choices": [{"index": 0, "delta": {}, "finish_reason": "error"}],
+                }
+                yield f"data: {json.dumps(error_chunk)}\n\n"
+                yield f"data: {json.dumps({'error': item['__error__']})}\n\n"
+                break
+
+            # Explicit final (do not emit yet; keep last candidate)
+            if isinstance(item, dict) and "__final__" in item:
+                cand = _safe_extract_candidate(item["__final__"])
+                if cand:
+                    final_candidate = cand
+                continue
+
+            # Live stdout -> reasoning_content
+            if (
+                isinstance(item, dict)
+                and "__stdout__" in item
+                and isinstance(item["__stdout__"], str)
+            ):
+                for line in item["__stdout__"].splitlines():
+                    parsed = _maybe_parse_final_from_stdout(line)
+                    if parsed:
+                        final_candidate = parsed
+                    rt = _format_reasoning_chunk(
+                        line, "stdout", reasoning_idx := reasoning_idx + 1
+                    )
+                    if rt:
+                        r_chunk = {
+                            **base,
+                            "choices": [
+                                {"index": 0, "delta": {"reasoning_content": rt}}
+                            ],
+                        }
+                        yield f"data: {json.dumps(r_chunk, ensure_ascii=False)}\n\n"
+                continue
+
+            # Observed step -> reasoning_content
+            if (
+                isinstance(item, dict)
+                and "__step__" in item
+                and isinstance(item["__step__"], str)
+            ):
+                for line in item["__step__"].splitlines():
+                    parsed = _maybe_parse_final_from_stdout(line)
+                    if parsed:
+                        final_candidate = parsed
+                    rt = _format_reasoning_chunk(
+                        line, "step", reasoning_idx := reasoning_idx + 1
+                    )
+                    if rt:
+                        r_chunk = {
+                            **base,
+                            "choices": [
+                                {"index": 0, "delta": {"reasoning_content": rt}}
+                            ],
+                        }
+                        yield f"data: {json.dumps(r_chunk, ensure_ascii=False)}\n\n"
+                continue
+
+            # Any other iterable/text from agent -> candidate answer
+            cand = _safe_extract_candidate(item)
+            if cand:
+                final_candidate = cand
+
+            # Cooperative scheduling
+            await asyncio.sleep(0)
+
+        # Emit visible answer once at the end (scrub any stray tags)
+        visible = scrub_think_tags(final_candidate or "")
+        if not visible or visible.strip().lower() == "none":
+            visible = "Done."
+        final_chunk = {**base, "choices": [{"index": 0, "delta": {"content": visible}}]}
+        yield f"data: {json.dumps(final_chunk, ensure_ascii=False)}\n\n"
+
+        stop_chunk = {
+            **base,
+            "choices": [{"index": 0, "delta": {}, "finish_reason": "stop"}],
+        }
+        yield f"data: {json.dumps(stop_chunk)}\n\n"
+        yield "data: [DONE]\n\n"
+
+    return _gen
+
+
+async def _run_non_streaming(task: str, agent_for_request: typing.Any) -> str:
+    """
+    Runs the agent and returns a single OpenAI-style text (with optional <think> block).
+    """
+    reasoning_lines: typing.List[str] = []
+    final_candidate: typing.Optional[str] = None
+
+    async for item in run_agent_stream(task, agent_for_request):
+        if isinstance(item, dict) and "__error__" in item:
+            raise Exception(item["__error__"])
+
+        if isinstance(item, dict) and "__final__" in item:
+            cand = _safe_extract_candidate(item["__final__"])
+            if cand:
+                final_candidate = cand
+            continue
+
+        if isinstance(item, dict) and "__stdout__" in item:
+            lines = scrub_think_tags(item["__stdout__"]).rstrip("\n").splitlines()
+            for line in lines:
+                parsed = _maybe_parse_final_from_stdout(line)
+                if parsed:
+                    final_candidate = parsed
+                rt = _format_reasoning_chunk(line, "stdout", len(reasoning_lines) + 1)
+                if rt:
+                    reasoning_lines.append(rt)
+            continue
+
+        if isinstance(item, dict) and "__step__" in item:
+            lines = scrub_think_tags(item["__step__"]).rstrip("\n").splitlines()
+            for line in lines:
+                parsed = _maybe_parse_final_from_stdout(line)
+                if parsed:
+                    final_candidate = parsed
+                rt = _format_reasoning_chunk(line, "step", len(reasoning_lines) + 1)
+                if rt:
+                    reasoning_lines.append(rt)
+            continue
+
+        cand = _safe_extract_candidate(item)
+        if cand:
+            final_candidate = cand
+
+    reasoning_blob = _truncate_reasoning_blob("\n".join(reasoning_lines).strip())
+    think_block = f"<think>\n{reasoning_blob}\n</think>\n" if reasoning_blob else ""
+    final_text = scrub_think_tags(final_candidate or "")
+    if not final_text or final_text.strip().lower() == "none":
+        final_text = "Done."
+    return f"{think_block}{final_text}"
+
+
+# --------------------------------------------------------------------------------------
+# HTTP Handlers (thin wrappers around helpers)
+# --------------------------------------------------------------------------------------
+@app.get("/healthz")
+async def healthz():
+    return {"ok": True}
+
+
+@app.get("/v1/models")
+async def list_models():
+    return _models_payload()
+
+
 @app.post("/v1/chat/completions")
 async def chat_completions(req: fastapi.Request):
+    # ---------------- Parse & basic validation ----------------
     try:
         body: ChatCompletionRequest = typing.cast(
             ChatCompletionRequest, await req.json()
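
A sketch of exercising the refactored endpoint in non-streaming mode, assuming the proxy listens on http://localhost:8000 and that _openai_response produces the standard OpenAI chat.completion shape (neither is shown in this diff):

# Sketch: non-streaming request against one of the local agent ids.
import httpx

resp = httpx.post(
    "http://localhost:8000/v1/chat/completions",
    json={
        "model": "code-writing-agent-without-tools",
        "messages": [{"role": "user", "content": "Sum the first 10 primes."}],
        "stream": False,
    },
    timeout=300.0,  # agent runs can take a while
)
resp.raise_for_status()
# _run_non_streaming prefixes the answer with an optional <think>...</think> block.
print(resp.json()["choices"][0]["message"]["content"])
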
@@ -105,247 +391,53 @@ async def chat_completions(req: fastapi.Request):
             {"error": {"message": f"Invalid JSON: {e}"}}, status_code=400
         )
 
-    messages =
-
-    raw_model = body.get("model")
-    model_name = (
-        raw_model.get("id")
-        if isinstance(raw_model, dict)
-        else (raw_model or "code-writing-agent-without-tools")
+    messages: typing.List[ChatMessage] = typing.cast(
+        typing.List[ChatMessage], body.get("messages") or []
     )
-
-
-        return await _proxy_upstream_chat_completions(dict(body), stream)
-    if model_name == AGENT_MODEL + "-nothink":
-        # Remove "-nothink" from the model name in body
-        body["model"] = AGENT_MODEL
-
-        # Add /nothink to the end of the message contents to disable think tags
-        new_messages = []
-        for msg in messages:
-            if msg.get("role") == "user":
-                content = normalize_content_to_text(msg.get("content", ""))
-                content += "\n/nothink"
-                new_msg: ChatMessage = {
-                    "role": "user",
-                    "content": content,
-                }
-                new_messages.append(new_msg)
-            else:
-                new_messages.append(msg)
-        body["messages"] = new_messages
-        return await _proxy_upstream_chat_completions(
-            dict(body), stream, scrub_think=True
-        )
-
-    # Otherwise, reasoning-aware wrapper
-    task = _messages_to_task(messages)
-
-    # Per-request agent override if a custom model id was provided (different from defaults)
-    agent_for_request = None
-    if model_name not in (
-        AGENT_MODEL,
-        AGENT_MODEL + "-nothink",
-    ) and isinstance(model_name, str):
-        if model_name == "code-writing-agent-without-tools":
-            agent_for_request = generate_code_writing_agent_without_tools()
-        elif model_name == "code-writing-agent-with-search":
-            agent_for_request = generate_code_writing_agent_with_search()
-        elif model_name == "tool-calling-agent-with-search-and-code":
-            agent_for_request = generate_tool_calling_agent_with_search_and_code()
-        elif model_name == "generator-with-managed-critic":
-            agent_for_request = generate_generator_with_managed_critic()
-        else:
-            # Emit error for unknown model
-            return fastapi.responses.JSONResponse(
-                status_code=400,
-                content={
-                    "error": {
-                        "message": f"Unknown model id: {model_name}",
-                        "type": "invalid_request_error",
-                    }
-                },
-            )
+    stream: bool = bool(body.get("stream", False))
+    model_name: str = _normalize_model_name(body.get("model"))
 
     try:
-
-
-
-
-
-
-
-
-
-
-                    "index": 0,
-                    "delta": {"role": "assistant"},
-                    "finish_reason": None,
-                }
-            ],
-        }
-        yield f"data: {json.dumps(base)}\n\n"
+        # ---------------- Upstream pass-through modes ----------------
+        if _is_upstream_passthrough(model_name):
+            # Raw pass-through to upstream
+            return await _proxy_upstream_chat_completions(dict(body), stream)
 
-
-
+        if _is_upstream_passthrough_nothink(model_name):
+            # Modify body for /nothink and forward to upstream
+            return await _proxy_upstream_chat_completions(
+                _apply_nothink_to_body(body, messages), stream, scrub_think=True
+            )
 
-
-
-            if isinstance(item, dict) and "__error__" in item:
-                error_chunk = {
-                    **base,
-                    "choices": [
-                        {"index": 0, "delta": {}, "finish_reason": "error"}
-                    ],
-                }
-                yield f"data: {json.dumps(error_chunk)}\n\n"
-                yield f"data: {json.dumps({'error': item['__error__']})}\n\n"
-                break
-
-            # Explicit final result from the agent
-            if isinstance(item, dict) and "__final__" in item:
-                val = item["__final__"]
-                cand = _extract_final_text(val)
-                # Only update if the agent actually provided a non-empty answer
-                if cand and cand.strip().lower() != "none":
-                    final_candidate = cand
-                # do not emit anything yet; we'll send a single final chunk below
-                continue
-
-            # Live stdout -> reasoning_content
-            if (
-                isinstance(item, dict)
-                and "__stdout__" in item
-                and isinstance(item["__stdout__"], str)
-            ):
-                for line in item["__stdout__"].splitlines():
-                    parsed = _maybe_parse_final_from_stdout(line)
-                    if parsed:
-                        final_candidate = parsed
-                    rt = _format_reasoning_chunk(
-                        line, "stdout", reasoning_idx := reasoning_idx + 1
-                    )
-                    if rt:
-                        r_chunk = {
-                            **base,
-                            "choices": [
-                                {"index": 0, "delta": {"reasoning_content": rt}}
-                            ],
-                        }
-                        yield f"data: {json.dumps(r_chunk, ensure_ascii=False)}\n\n"
-                continue
-
-            # Newly observed step -> reasoning_content
-            if (
-                isinstance(item, dict)
-                and "__step__" in item
-                and isinstance(item["__step__"], str)
-            ):
-                for line in item["__step__"].splitlines():
-                    parsed = _maybe_parse_final_from_stdout(line)
-                    if parsed:
-                        final_candidate = parsed
-                    rt = _format_reasoning_chunk(
-                        line, "step", reasoning_idx := reasoning_idx + 1
-                    )
-                    if rt:
-                        r_chunk = {
-                            **base,
-                            "choices": [
-                                {"index": 0, "delta": {"reasoning_content": rt}}
-                            ],
-                        }
-                        yield f"data: {json.dumps(r_chunk, ensure_ascii=False)}\n\n"
-                continue
-
-            # Any iterable output from the agent (rare) — treat as candidate answer
-            cand = _extract_final_text(item)
-            if cand:
-                final_candidate = cand
-
-            await asyncio.sleep(0)  # keep the loop fair
-
-        # Emit the visible answer once at the end (scrub any stray tags)
-        visible = scrub_think_tags(final_candidate or "")
-        if not visible or visible.strip().lower() == "none":
-            visible = "Done."
-        final_chunk = {
-            **base,
-            "choices": [{"index": 0, "delta": {"content": visible}}],
-        }
-        yield f"data: {json.dumps(final_chunk, ensure_ascii=False)}\n\n"
-
-        stop_chunk = {
-            **base,
-            "choices": [{"index": 0, "delta": {}, "finish_reason": "stop"}],
-        }
-        yield f"data: {json.dumps(stop_chunk)}\n\n"
-        yield "data: [DONE]\n\n"
+        # ---------------- Local agent execution ----------------
+        # Convert OpenAI messages -> internal "task"
+        task: str = _messages_to_task(messages)
 
+        # Create agent impl for the requested local model
+        agent_for_request = _agent_for_model(model_name)
+
+        if stream:
+            # Streaming: return SSE response
+            gen = _make_sse_generator(task, agent_for_request, model_name)
             return fastapi.responses.StreamingResponse(
-
+                gen(), media_type="text/event-stream", headers=_sse_headers()
             )
-
         else:
-            # Non-streaming:
-
-
-
-            async for item in run_agent_stream(task, agent_for_request):
-                if isinstance(item, dict) and "__error__" in item:
-                    raise Exception(item["__error__"])
-
-                if isinstance(item, dict) and "__final__" in item:
-                    val = item["__final__"]
-                    cand = _extract_final_text(val)
-                    if cand and cand.strip().lower() != "none":
-                        final_candidate = cand
-                    continue
-
-                if isinstance(item, dict) and "__stdout__" in item:
-                    lines = (
-                        scrub_think_tags(item["__stdout__"]).rstrip("\n").splitlines()
-                    )
-                    for line in lines:
-                        parsed = _maybe_parse_final_from_stdout(line)
-                        if parsed:
-                            final_candidate = parsed
-                        rt = _format_reasoning_chunk(
-                            line, "stdout", len(reasoning_lines) + 1
-                        )
-                        if rt:
-                            reasoning_lines.append(rt)
-                    continue
-
-                if isinstance(item, dict) and "__step__" in item:
-                    lines = scrub_think_tags(item["__step__"]).rstrip("\n").splitlines()
-                    for line in lines:
-                        parsed = _maybe_parse_final_from_stdout(line)
-                        if parsed:
-                            final_candidate = parsed
-                        rt = _format_reasoning_chunk(
-                            line, "step", len(reasoning_lines) + 1
-                        )
-                        if rt:
-                            reasoning_lines.append(rt)
-                    continue
-
-                cand = _extract_final_text(item)
-                if cand:
-                    final_candidate = cand
-
-            reasoning_blob = "\n".join(reasoning_lines).strip()
-            if len(reasoning_blob) > 24000:
-                reasoning_blob = reasoning_blob[:24000] + "\n… [truncated]"
-            think_block = (
-                f"<think>\n{reasoning_blob}\n</think>\n" if reasoning_blob else ""
+            # Non-streaming: materialize final text and wrap in OpenAI shape
+            result_text = await _run_non_streaming(task, agent_for_request)
+            return fastapi.responses.JSONResponse(
+                _openai_response(result_text, model_name)
             )
-            final_text = scrub_think_tags(final_candidate or "")
-            if not final_text or final_text.strip().lower() == "none":
-                final_text = "Done."
-            result_text = f"{think_block}{final_text}"
 
+    except ValueError as ve:
+        # Unknown model or other parameter validation errors
+        log.error("Invalid request: %s", ve)
+        return fastapi.responses.JSONResponse(
+            status_code=400,
+            content={"error": {"message": str(ve), "type": "invalid_request_error"}},
+        )
     except Exception as e:
+        # Operational / agent runtime errors
         msg = str(e)
         status = 503 if "503" in msg or "Service Unavailable" in msg else 500
         log.error("Agent error (%s): %s", status, msg)
@@ -356,18 +448,10 @@ async def chat_completions(req: fastapi.Request):
             },
         )
 
-    # Non-streaming response
-    if result_text is None:
-        result_text = ""
-    if not isinstance(result_text, str):
-        try:
-            result_text = json.dumps(result_text, ensure_ascii=False)
-        except Exception:
-            result_text = str(result_text)
-    return fastapi.responses.JSONResponse(_openai_response(result_text, model_name))
-
 
-#
+# --------------------------------------------------------------------------------------
+# Local dev entrypoint
+# --------------------------------------------------------------------------------------
 if __name__ == "__main__":
     import uvicorn
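
Finally, a sketch of consuming the SSE stream produced by _make_sse_generator, under the same localhost assumption. The generator emits a role-header chunk first, then reasoning_content deltas, one content delta with the visible answer, a finish_reason "stop" chunk, and "[DONE]":

# Sketch: streaming client for the proxy's SSE output.
import json
import httpx

with httpx.stream(
    "POST",
    "http://localhost:8000/v1/chat/completions",
    json={
        "model": "tool-calling-agent-with-search-and-code",
        "messages": [{"role": "user", "content": "Who wrote SICP?"}],
        "stream": True,
    },
    timeout=None,
) as resp:
    for line in resp.iter_lines():
        if not line.startswith("data: "):
            continue
        payload = line[len("data: "):]
        if payload == "[DONE]":
            break
        chunk = json.loads(payload)
        delta = chunk.get("choices", [{}])[0].get("delta", {})
        if "reasoning_content" in delta:
            print("[reasoning]", delta["reasoning_content"])
        elif "content" in delta:
            print(delta["content"])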