ccm committed on
Commit 3f686b5 · 1 Parent(s): 2fabb0d

Adding gitignore and modularizing the proxy file
Files changed (2):
  1. .gitignore +2 -0
  2. proxy.py +342 -258
.gitignore ADDED
@@ -0,0 +1,2 @@
+.venv
+.idea
proxy.py CHANGED
@@ -1,5 +1,6 @@
 """
 OpenAI-compatible FastAPI proxy that wraps a smolagents CodeAgent
+Refactored for readability and modularity (single-file).
 """
 
 import os # For dealing with env vars
@@ -9,48 +10,107 @@ import asyncio # For async operations
 import typing # For type annotations
 import logging # For logging
 
+
 import fastapi
 import fastapi.responses
 
-# Upstream pass-through
-from agent_server.agent_streaming import run_agent_stream, _proxy_upstream_chat_completions
-from agent_server.formatting_reasoning import _format_reasoning_chunk, _extract_final_text, \
-    _maybe_parse_final_from_stdout
-from agent_server.helpers import normalize_content_to_text, _messages_to_task, _openai_response, _sse_headers
+# Upstream pass-through + local helpers
+from agent_server.agent_streaming import (
+    run_agent_stream,
+    _proxy_upstream_chat_completions,
+)
+from agent_server.formatting_reasoning import (
+    _format_reasoning_chunk,
+    _extract_final_text,
+    _maybe_parse_final_from_stdout,
+)
+from agent_server.helpers import (
+    normalize_content_to_text,
+    _messages_to_task,
+    _openai_response,
+    _sse_headers,
+)
 from agent_server.openai_schemas import ChatMessage, ChatCompletionRequest
 from agent_server.sanitizing_think_tags import scrub_think_tags
+
+# Local agent factories
 from agents.code_writing_agents import (
     generate_code_writing_agent_without_tools,
     generate_code_writing_agent_with_search,
 )
-
 from agents.json_tool_calling_agents import (
     generate_tool_calling_agent_with_search_and_code,
 )
-
 from agents.generator_and_critic import generate_generator_with_managed_critic
 
-# Logging setup
+# --------------------------------------------------------------------------------------
+# Logging / Config
+# --------------------------------------------------------------------------------------
 logging.basicConfig(level=os.getenv("LOG_LEVEL", "INFO").upper())
 log = logging.getLogger(__name__)
 
 AGENT_MODEL = os.getenv("AGENT_MODEL", "Qwen/Qwen3-1.7B")
 
-# ================== FastAPI ==================
+# --------------------------------------------------------------------------------------
+# FastAPI app
+# --------------------------------------------------------------------------------------
 app = fastapi.FastAPI()
 
 
-@app.get("/healthz")
-async def healthz():
-    return {"ok": True}
+# --------------------------------------------------------------------------------------
+# Utility helpers (pure functions)
+# --------------------------------------------------------------------------------------
+def _now_ts() -> int:
+    return int(time.time())
 
-# ---------- Agent streaming bridge (truly live) ----------
 
+def _normalize_model_name(raw_model: typing.Union[str, dict, None]) -> str:
+    """
+    Accepts either a bare model string or {"id": "..."} form; default to the
+    local code-writing agent if unspecified.
+    """
+    if isinstance(raw_model, dict):
+        return typing.cast(str, raw_model.get("id", "code-writing-agent-without-tools"))
+    if isinstance(raw_model, str) and raw_model.strip():
+        return raw_model
+    return "code-writing-agent-without-tools"
 
-# ---------- Endpoints ----------
-@app.get("/v1/models")
-async def list_models():
-    now = int(time.time())
+
+def _is_upstream_passthrough(model_name: str) -> bool:
+    return model_name == AGENT_MODEL
+
+
+def _is_upstream_passthrough_nothink(model_name: str) -> bool:
+    return model_name == f"{AGENT_MODEL}-nothink"
+
+
+def _apply_nothink_to_body(
+    body: ChatCompletionRequest, messages: typing.List[ChatMessage]
+) -> ChatCompletionRequest:
+    """
+    Mutates message content to request 'no-think' behavior upstream.
+    - Sets body["model"] to AGENT_MODEL (strip -nothink)
+    - Appends '/nothink' to user message content
+    """
+    new_body: ChatCompletionRequest = dict(body)  # shallow copy is fine
+    new_body["model"] = AGENT_MODEL
+
+    new_messages: typing.List[ChatMessage] = []
+    for msg in messages:
+        if msg.get("role") == "user":
+            content = normalize_content_to_text(msg.get("content", ""))
+            new_messages.append({"role": "user", "content": content + "\n/nothink"})
+        else:
+            new_messages.append(msg)
+    new_body["messages"] = new_messages
+    return new_body
+
+
+def _models_payload() -> dict:
+    """
+    Returns the /v1/models response payload.
+    """
+    now = _now_ts()
     return {
         "object": "list",
         "data": [
@@ -85,7 +145,7 @@ async def list_models():
                 "owned_by": "upstream",
             },
             {
-                "id": AGENT_MODEL + "-nothink",
+                "id": f"{AGENT_MODEL}-nothink",
                 "object": "model",
                 "created": now,
                 "owned_by": "upstream",
@@ -94,8 +154,234 @@ async def list_models():
     }
 
 
+def _agent_for_model(model_name: str):
+    """
+    Returns an instantiated agent for the given local model id.
+    Raises ValueError on unknown local ids.
+    """
+    if model_name == "code-writing-agent-without-tools":
+        return generate_code_writing_agent_without_tools()
+    if model_name == "code-writing-agent-with-search":
+        return generate_code_writing_agent_with_search()
+    if model_name == "tool-calling-agent-with-search-and-code":
+        return generate_tool_calling_agent_with_search_and_code()
+    if model_name == "generator-with-managed-critic":
+        return generate_generator_with_managed_critic()
+    raise ValueError(f"Unknown model id: {model_name}")
+
+
+def _openai_stream_base(model_name: str) -> dict:
+    """
+    The base chunk used for all SSE deltas in streaming mode.
+    """
+    return {
+        "id": f"chatcmpl-smol-{_now_ts()}",
+        "object": "chat.completion.chunk",
+        "created": _now_ts(),
+        "model": model_name,
+        "choices": [
+            {
+                "index": 0,
+                "delta": {"role": "assistant"},
+                "finish_reason": None,
+            }
+        ],
+    }
+
+
+def _safe_extract_candidate(val: typing.Any) -> typing.Optional[str]:
+    """
+    Extracts a candidate final text string if present and non-empty.
+    """
+    cand = _extract_final_text(val)
+    if cand and cand.strip().lower() != "none":
+        return cand
+    return None
+
+
+def _truncate_reasoning_blob(reasoning: str, limit: int = 24000) -> str:
+    if len(reasoning) > limit:
+        return reasoning[:limit] + "\n… [truncated]"
+    return reasoning
+
+
+# --------------------------------------------------------------------------------------
+# Streaming + non-streaming execution
+# --------------------------------------------------------------------------------------
+def _make_sse_generator(
+    task: str,
+    agent_for_request: typing.Any,
+    model_name: str,
+):
+    """
+    Returns an async generator that yields SSE 'data:' lines for FastAPI StreamingResponse.
+    """
+
+    async def _gen():
+        base = _openai_stream_base(model_name)
+
+        # initial role header
+        yield f"data: {json.dumps(base)}\n\n"
+
+        reasoning_idx = 0
+        final_candidate: typing.Optional[str] = None
+
+        async for item in run_agent_stream(task, agent_for_request):
+            # Short-circuit on explicit error signaled by the runner
+            if isinstance(item, dict) and "__error__" in item:
+                error_chunk = {
+                    **base,
+                    "choices": [{"index": 0, "delta": {}, "finish_reason": "error"}],
+                }
+                yield f"data: {json.dumps(error_chunk)}\n\n"
+                yield f"data: {json.dumps({'error': item['__error__']})}\n\n"
+                break
+
+            # Explicit final (do not emit yet; keep last candidate)
+            if isinstance(item, dict) and "__final__" in item:
+                cand = _safe_extract_candidate(item["__final__"])
+                if cand:
+                    final_candidate = cand
+                continue
+
+            # Live stdout -> reasoning_content
+            if (
+                isinstance(item, dict)
+                and "__stdout__" in item
+                and isinstance(item["__stdout__"], str)
+            ):
+                for line in item["__stdout__"].splitlines():
+                    parsed = _maybe_parse_final_from_stdout(line)
+                    if parsed:
+                        final_candidate = parsed
+                    rt = _format_reasoning_chunk(
+                        line, "stdout", reasoning_idx := reasoning_idx + 1
+                    )
+                    if rt:
+                        r_chunk = {
+                            **base,
+                            "choices": [
+                                {"index": 0, "delta": {"reasoning_content": rt}}
+                            ],
+                        }
+                        yield f"data: {json.dumps(r_chunk, ensure_ascii=False)}\n\n"
+                continue
+
+            # Observed step -> reasoning_content
+            if (
+                isinstance(item, dict)
+                and "__step__" in item
+                and isinstance(item["__step__"], str)
+            ):
+                for line in item["__step__"].splitlines():
+                    parsed = _maybe_parse_final_from_stdout(line)
+                    if parsed:
+                        final_candidate = parsed
+                    rt = _format_reasoning_chunk(
+                        line, "step", reasoning_idx := reasoning_idx + 1
+                    )
+                    if rt:
+                        r_chunk = {
+                            **base,
+                            "choices": [
+                                {"index": 0, "delta": {"reasoning_content": rt}}
+                            ],
+                        }
+                        yield f"data: {json.dumps(r_chunk, ensure_ascii=False)}\n\n"
+                continue
+
+            # Any other iterable/text from agent -> candidate answer
+            cand = _safe_extract_candidate(item)
+            if cand:
+                final_candidate = cand
+
+            # Cooperative scheduling
+            await asyncio.sleep(0)
+
+        # Emit visible answer once at the end (scrub any stray tags)
+        visible = scrub_think_tags(final_candidate or "")
+        if not visible or visible.strip().lower() == "none":
+            visible = "Done."
+        final_chunk = {**base, "choices": [{"index": 0, "delta": {"content": visible}}]}
+        yield f"data: {json.dumps(final_chunk, ensure_ascii=False)}\n\n"
+
+        stop_chunk = {
+            **base,
+            "choices": [{"index": 0, "delta": {}, "finish_reason": "stop"}],
+        }
+        yield f"data: {json.dumps(stop_chunk)}\n\n"
+        yield "data: [DONE]\n\n"
+
+    return _gen
+
+
+async def _run_non_streaming(task: str, agent_for_request: typing.Any) -> str:
+    """
+    Runs the agent and returns a single OpenAI-style text (with optional <think> block).
+    """
+    reasoning_lines: typing.List[str] = []
+    final_candidate: typing.Optional[str] = None
+
+    async for item in run_agent_stream(task, agent_for_request):
+        if isinstance(item, dict) and "__error__" in item:
+            raise Exception(item["__error__"])
+
+        if isinstance(item, dict) and "__final__" in item:
+            cand = _safe_extract_candidate(item["__final__"])
+            if cand:
+                final_candidate = cand
+            continue
+
+        if isinstance(item, dict) and "__stdout__" in item:
+            lines = scrub_think_tags(item["__stdout__"]).rstrip("\n").splitlines()
+            for line in lines:
+                parsed = _maybe_parse_final_from_stdout(line)
+                if parsed:
+                    final_candidate = parsed
+                rt = _format_reasoning_chunk(line, "stdout", len(reasoning_lines) + 1)
+                if rt:
+                    reasoning_lines.append(rt)
+            continue
+
+        if isinstance(item, dict) and "__step__" in item:
+            lines = scrub_think_tags(item["__step__"]).rstrip("\n").splitlines()
+            for line in lines:
+                parsed = _maybe_parse_final_from_stdout(line)
+                if parsed:
+                    final_candidate = parsed
+                rt = _format_reasoning_chunk(line, "step", len(reasoning_lines) + 1)
+                if rt:
+                    reasoning_lines.append(rt)
+            continue
+
+        cand = _safe_extract_candidate(item)
+        if cand:
+            final_candidate = cand
+
+    reasoning_blob = _truncate_reasoning_blob("\n".join(reasoning_lines).strip())
+    think_block = f"<think>\n{reasoning_blob}\n</think>\n" if reasoning_blob else ""
+    final_text = scrub_think_tags(final_candidate or "")
+    if not final_text or final_text.strip().lower() == "none":
+        final_text = "Done."
+    return f"{think_block}{final_text}"
+
+
+# --------------------------------------------------------------------------------------
+# HTTP Handlers (thin wrappers around helpers)
+# --------------------------------------------------------------------------------------
+@app.get("/healthz")
+async def healthz():
+    return {"ok": True}
+
+
+@app.get("/v1/models")
+async def list_models():
+    return _models_payload()
+
+
 @app.post("/v1/chat/completions")
 async def chat_completions(req: fastapi.Request):
+    # ---------------- Parse & basic validation ----------------
     try:
         body: ChatCompletionRequest = typing.cast(
             ChatCompletionRequest, await req.json()
@@ -105,247 +391,53 @@ async def chat_completions(req: fastapi.Request):
             {"error": {"message": f"Invalid JSON: {e}"}}, status_code=400
         )
 
-    messages = body.get("messages") or []
-    stream = bool(body.get("stream", False))
-    raw_model = body.get("model")
-    model_name = (
-        raw_model.get("id")
-        if isinstance(raw_model, dict)
-        else (raw_model or "code-writing-agent-without-tools")
+    messages: typing.List[ChatMessage] = typing.cast(
+        typing.List[ChatMessage], body.get("messages") or []
     )
-    # Pure pass-through if the user selects the upstream model id
-    if model_name == AGENT_MODEL:
-        return await _proxy_upstream_chat_completions(dict(body), stream)
-    if model_name == AGENT_MODEL + "-nothink":
-        # Remove "-nothink" from the model name in body
-        body["model"] = AGENT_MODEL
-
-        # Add /nothink to the end of the message contents to disable think tags
-        new_messages = []
-        for msg in messages:
-            if msg.get("role") == "user":
-                content = normalize_content_to_text(msg.get("content", ""))
-                content += "\n/nothink"
-                new_msg: ChatMessage = {
-                    "role": "user",
-                    "content": content,
-                }
-                new_messages.append(new_msg)
-            else:
-                new_messages.append(msg)
-        body["messages"] = new_messages
-        return await _proxy_upstream_chat_completions(
-            dict(body), stream, scrub_think=True
-        )
-
-    # Otherwise, reasoning-aware wrapper
-    task = _messages_to_task(messages)
-
-    # Per-request agent override if a custom model id was provided (different from defaults)
-    agent_for_request = None
-    if model_name not in (
-        AGENT_MODEL,
-        AGENT_MODEL + "-nothink",
-    ) and isinstance(model_name, str):
-        if model_name == "code-writing-agent-without-tools":
-            agent_for_request = generate_code_writing_agent_without_tools()
-        elif model_name == "code-writing-agent-with-search":
-            agent_for_request = generate_code_writing_agent_with_search()
-        elif model_name == "tool-calling-agent-with-search-and-code":
-            agent_for_request = generate_tool_calling_agent_with_search_and_code()
-        elif model_name == "generator-with-managed-critic":
-            agent_for_request = generate_generator_with_managed_critic()
-        else:
-            # Emit error for unknown model
-            return fastapi.responses.JSONResponse(
-                status_code=400,
-                content={
-                    "error": {
-                        "message": f"Unknown model id: {model_name}",
-                        "type": "invalid_request_error",
-                    }
-                },
-            )
+    stream: bool = bool(body.get("stream", False))
+    model_name: str = _normalize_model_name(body.get("model"))
 
     try:
-        if stream:
-
-            async def sse_streamer():
-                base = {
-                    "id": f"chatcmpl-smol-{int(time.time())}",
-                    "object": "chat.completion.chunk",
-                    "created": int(time.time()),
-                    "model": model_name,
-                    "choices": [
-                        {
-                            "index": 0,
-                            "delta": {"role": "assistant"},
-                            "finish_reason": None,
-                        }
-                    ],
-                }
-                yield f"data: {json.dumps(base)}\n\n"
-
-                reasoning_idx = 0
-                final_candidate: typing.Optional[str] = None
-
-                async for item in run_agent_stream(task, agent_for_request):
-                    # Error short-circuit
-                    if isinstance(item, dict) and "__error__" in item:
-                        error_chunk = {
-                            **base,
-                            "choices": [
-                                {"index": 0, "delta": {}, "finish_reason": "error"}
-                            ],
-                        }
-                        yield f"data: {json.dumps(error_chunk)}\n\n"
-                        yield f"data: {json.dumps({'error': item['__error__']})}\n\n"
-                        break
-
-                    # Explicit final result from the agent
-                    if isinstance(item, dict) and "__final__" in item:
-                        val = item["__final__"]
-                        cand = _extract_final_text(val)
-                        # Only update if the agent actually provided a non-empty answer
-                        if cand and cand.strip().lower() != "none":
-                            final_candidate = cand
-                        # do not emit anything yet; we'll send a single final chunk below
-                        continue
-
-                    # Live stdout -> reasoning_content
-                    if (
-                        isinstance(item, dict)
-                        and "__stdout__" in item
-                        and isinstance(item["__stdout__"], str)
-                    ):
-                        for line in item["__stdout__"].splitlines():
-                            parsed = _maybe_parse_final_from_stdout(line)
-                            if parsed:
-                                final_candidate = parsed
-                            rt = _format_reasoning_chunk(
-                                line, "stdout", reasoning_idx := reasoning_idx + 1
-                            )
-                            if rt:
-                                r_chunk = {
-                                    **base,
-                                    "choices": [
-                                        {"index": 0, "delta": {"reasoning_content": rt}}
-                                    ],
-                                }
-                                yield f"data: {json.dumps(r_chunk, ensure_ascii=False)}\n\n"
-                        continue
-
-                    # Newly observed step -> reasoning_content
-                    if (
-                        isinstance(item, dict)
-                        and "__step__" in item
-                        and isinstance(item["__step__"], str)
-                    ):
-                        for line in item["__step__"].splitlines():
-                            parsed = _maybe_parse_final_from_stdout(line)
-                            if parsed:
-                                final_candidate = parsed
-                            rt = _format_reasoning_chunk(
-                                line, "step", reasoning_idx := reasoning_idx + 1
-                            )
-                            if rt:
-                                r_chunk = {
-                                    **base,
-                                    "choices": [
-                                        {"index": 0, "delta": {"reasoning_content": rt}}
-                                    ],
-                                }
-                                yield f"data: {json.dumps(r_chunk, ensure_ascii=False)}\n\n"
-                        continue
-
-                    # Any iterable output from the agent (rare) — treat as candidate answer
-                    cand = _extract_final_text(item)
-                    if cand:
-                        final_candidate = cand
-
-                    await asyncio.sleep(0) # keep the loop fair
-
-                # Emit the visible answer once at the end (scrub any stray tags)
-                visible = scrub_think_tags(final_candidate or "")
-                if not visible or visible.strip().lower() == "none":
-                    visible = "Done."
-                final_chunk = {
-                    **base,
-                    "choices": [{"index": 0, "delta": {"content": visible}}],
-                }
-                yield f"data: {json.dumps(final_chunk, ensure_ascii=False)}\n\n"
-
-                stop_chunk = {
-                    **base,
-                    "choices": [{"index": 0, "delta": {}, "finish_reason": "stop"}],
-                }
-                yield f"data: {json.dumps(stop_chunk)}\n\n"
-                yield "data: [DONE]\n\n"
-
+        # ---------------- Upstream pass-through modes ----------------
+        if _is_upstream_passthrough(model_name):
+            # Raw pass-through to upstream
+            return await _proxy_upstream_chat_completions(dict(body), stream)
+
+        if _is_upstream_passthrough_nothink(model_name):
+            # Modify body for /nothink and forward to upstream
+            return await _proxy_upstream_chat_completions(
+                _apply_nothink_to_body(body, messages), stream, scrub_think=True
+            )
+
+        # ---------------- Local agent execution ----------------
+        # Convert OpenAI messages -> internal "task"
+        task: str = _messages_to_task(messages)
+
+        # Create agent impl for the requested local model
+        agent_for_request = _agent_for_model(model_name)
+
+        if stream:
+            # Streaming: return SSE response
+            gen = _make_sse_generator(task, agent_for_request, model_name)
             return fastapi.responses.StreamingResponse(
-                sse_streamer(), media_type="text/event-stream", headers=_sse_headers()
+                gen(), media_type="text/event-stream", headers=_sse_headers()
             )
-
         else:
-            # Non-streaming: collect into <think>…</think> + final
-            reasoning_lines: typing.List[str] = []
-            final_candidate: typing.Optional[str] = None
-
-            async for item in run_agent_stream(task, agent_for_request):
-                if isinstance(item, dict) and "__error__" in item:
-                    raise Exception(item["__error__"])
-
-                if isinstance(item, dict) and "__final__" in item:
-                    val = item["__final__"]
-                    cand = _extract_final_text(val)
-                    if cand and cand.strip().lower() != "none":
-                        final_candidate = cand
-                    continue
-
-                if isinstance(item, dict) and "__stdout__" in item:
-                    lines = (
-                        scrub_think_tags(item["__stdout__"]).rstrip("\n").splitlines()
-                    )
-                    for line in lines:
-                        parsed = _maybe_parse_final_from_stdout(line)
-                        if parsed:
-                            final_candidate = parsed
-                        rt = _format_reasoning_chunk(
-                            line, "stdout", len(reasoning_lines) + 1
-                        )
-                        if rt:
-                            reasoning_lines.append(rt)
-                    continue
-
-                if isinstance(item, dict) and "__step__" in item:
-                    lines = scrub_think_tags(item["__step__"]).rstrip("\n").splitlines()
-                    for line in lines:
-                        parsed = _maybe_parse_final_from_stdout(line)
-                        if parsed:
-                            final_candidate = parsed
-                        rt = _format_reasoning_chunk(
-                            line, "step", len(reasoning_lines) + 1
-                        )
-                        if rt:
-                            reasoning_lines.append(rt)
-                    continue
-
-                cand = _extract_final_text(item)
-                if cand:
-                    final_candidate = cand
-
-            reasoning_blob = "\n".join(reasoning_lines).strip()
-            if len(reasoning_blob) > 24000:
-                reasoning_blob = reasoning_blob[:24000] + "\n… [truncated]"
-            think_block = (
-                f"<think>\n{reasoning_blob}\n</think>\n" if reasoning_blob else ""
+            # Non-streaming: materialize final text and wrap in OpenAI shape
+            result_text = await _run_non_streaming(task, agent_for_request)
+            return fastapi.responses.JSONResponse(
+                _openai_response(result_text, model_name)
             )
-            final_text = scrub_think_tags(final_candidate or "")
-            if not final_text or final_text.strip().lower() == "none":
-                final_text = "Done."
-            result_text = f"{think_block}{final_text}"
 
+    except ValueError as ve:
+        # Unknown model or other parameter validation errors
+        log.error("Invalid request: %s", ve)
+        return fastapi.responses.JSONResponse(
+            status_code=400,
+            content={"error": {"message": str(ve), "type": "invalid_request_error"}},
+        )
     except Exception as e:
+        # Operational / agent runtime errors
         msg = str(e)
         status = 503 if "503" in msg or "Service Unavailable" in msg else 500
         log.error("Agent error (%s): %s", status, msg)
@@ -356,18 +448,10 @@ async def chat_completions(req: fastapi.Request):
             },
         )
 
-    # Non-streaming response
-    if result_text is None:
-        result_text = ""
-    if not isinstance(result_text, str):
-        try:
-            result_text = json.dumps(result_text, ensure_ascii=False)
-        except Exception:
-            result_text = str(result_text)
-    return fastapi.responses.JSONResponse(_openai_response(result_text, model_name))
-
 
-# Optional: local run
+# --------------------------------------------------------------------------------------
+# Local dev entrypoint
+# --------------------------------------------------------------------------------------
 if __name__ == "__main__":
     import uvicorn
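
For quick manual testing, here is a minimal sketch of how a client could exercise the proxy's OpenAI-compatible endpoints after this refactor. The base URL (http://localhost:8000) and the use of the `requests` package are illustrative assumptions, not part of this commit; the route paths and model ids are taken from the diff above.

# Sketch only: assumes the proxy is served locally on port 8000 (not specified
# in this commit) and that the `requests` package is installed.
import requests

BASE = "http://localhost:8000"  # assumed host/port

# Health check and model listing (routes defined in proxy.py above)
print(requests.get(f"{BASE}/healthz").json())
print(requests.get(f"{BASE}/v1/models").json())

# Non-streaming chat completion against one of the local agent model ids
resp = requests.post(
    f"{BASE}/v1/chat/completions",
    json={
        "model": "code-writing-agent-without-tools",
        "messages": [{"role": "user", "content": "Reverse the string 'hello'."}],
        "stream": False,
    },
)
print(resp.json())

Requesting the upstream model id (the AGENT_MODEL value) or its -nothink variant would instead exercise the pass-through branches shown in the diff.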