ccm committed
Commit 864f881 · Parent: 3f686b5

Move more code out of proxy.py into separate modules
agent_server/chat_completions.py ADDED
@@ -0,0 +1,276 @@
+import asyncio
+import json
+import os
+import typing
+
+from agent_server.agent_streaming import run_agent_stream
+from agent_server.formatting_reasoning import (
+    _extract_final_text,
+    _maybe_parse_final_from_stdout,
+    _format_reasoning_chunk,
+)
+from agent_server.helpers import normalize_content_to_text, now_ts
+from agent_server.openai_schemas import ChatCompletionRequest, ChatMessage
+from agent_server.sanitizing_think_tags import scrub_think_tags
+from agents.code_writing_agents import (
+    generate_code_writing_agent_without_tools,
+    generate_code_writing_agent_with_search,
+)
+from agents.generator_and_critic import generate_generator_with_managed_critic
+from agents.json_tool_calling_agents import (
+    generate_tool_calling_agent_with_search_and_code,
+)
+
+
+AGENT_MODEL = os.getenv("AGENT_MODEL", "Qwen/Qwen3-1.7B")
+
+
+def _normalize_model_name(raw_model: typing.Union[str, dict, None]) -> str:
+    """
+    Accepts either a bare model string or the {"id": "..."} form; defaults to
+    the local code-writing agent if unspecified.
+    """
+    if isinstance(raw_model, dict):
+        return typing.cast(str, raw_model.get("id", "code-writing-agent-without-tools"))
+    if isinstance(raw_model, str) and raw_model.strip():
+        return raw_model
+    return "code-writing-agent-without-tools"
+
+
+def _is_upstream_passthrough(model_name: str) -> bool:
+    return model_name == AGENT_MODEL
+
+
+def _is_upstream_passthrough_nothink(model_name: str) -> bool:
+    return model_name == f"{AGENT_MODEL}-nothink"
+
+
+def _apply_nothink_to_body(
+    body: ChatCompletionRequest, messages: typing.List[ChatMessage]
+) -> ChatCompletionRequest:
+    """
+    Returns a copy of the request rewritten for 'no-think' behavior upstream.
+    - Sets body["model"] to AGENT_MODEL (strips the -nothink suffix)
+    - Appends '/nothink' to each user message's content
+    """
+    new_body: ChatCompletionRequest = dict(body)  # shallow copy is fine
+    new_body["model"] = AGENT_MODEL
+
+    new_messages: typing.List[ChatMessage] = []
+    for msg in messages:
+        if msg.get("role") == "user":
+            content = normalize_content_to_text(msg.get("content", ""))
+            new_messages.append({"role": "user", "content": content + "\n/nothink"})
+        else:
+            new_messages.append(msg)
+    new_body["messages"] = new_messages
+    return new_body
+
+
+def _agent_for_model(model_name: str):
+    """
+    Returns an instantiated agent for the given local model id.
+    Raises ValueError on unknown local ids.
+    """
+    if model_name == "code-writing-agent-without-tools":
+        return generate_code_writing_agent_without_tools()
+    if model_name == "code-writing-agent-with-search":
+        return generate_code_writing_agent_with_search()
+    if model_name == "tool-calling-agent-with-search-and-code":
+        return generate_tool_calling_agent_with_search_and_code()
+    if model_name == "generator-with-managed-critic":
+        return generate_generator_with_managed_critic()
+    raise ValueError(f"Unknown model id: {model_name}")
+
+
+def _openai_stream_base(model_name: str) -> dict:
+    """
+    The base chunk used for all SSE deltas in streaming mode.
+    """
+    return {
+        "id": f"chatcmpl-smol-{now_ts()}",
+        "object": "chat.completion.chunk",
+        "created": now_ts(),
+        "model": model_name,
+        "choices": [
+            {
+                "index": 0,
+                "delta": {"role": "assistant"},
+                "finish_reason": None,
+            }
+        ],
+    }
+
+
+def _safe_extract_candidate(val: typing.Any) -> typing.Optional[str]:
+    """
+    Extracts a candidate final-text string if present and non-empty.
+    """
+    cand = _extract_final_text(val)
+    if cand and cand.strip().lower() != "none":
+        return cand
+    return None
+
+
+def _truncate_reasoning_blob(reasoning: str, limit: int = 24000) -> str:
+    if len(reasoning) > limit:
+        return reasoning[:limit] + "\n… [truncated]"
+    return reasoning
+
+
+def _make_sse_generator(
+    task: str,
+    agent_for_request: typing.Any,
+    model_name: str,
+):
+    """
+    Returns an async generator that yields SSE 'data:' lines for FastAPI StreamingResponse.
+    """
+
+    async def _gen():
+        base = _openai_stream_base(model_name)
+
+        # initial role header
+        yield f"data: {json.dumps(base)}\n\n"
+
+        reasoning_idx = 0
+        final_candidate: typing.Optional[str] = None
+
+        async for item in run_agent_stream(task, agent_for_request):
+            # Short-circuit on an explicit error signaled by the runner
+            if isinstance(item, dict) and "__error__" in item:
+                error_chunk = {
+                    **base,
+                    "choices": [{"index": 0, "delta": {}, "finish_reason": "error"}],
+                }
+                yield f"data: {json.dumps(error_chunk)}\n\n"
+                yield f"data: {json.dumps({'error': item['__error__']})}\n\n"
+                break
+
+            # Explicit final (do not emit yet; keep the last candidate)
+            if isinstance(item, dict) and "__final__" in item:
+                cand = _safe_extract_candidate(item["__final__"])
+                if cand:
+                    final_candidate = cand
+                continue
+
+            # Live stdout -> reasoning_content
+            if (
+                isinstance(item, dict)
+                and "__stdout__" in item
+                and isinstance(item["__stdout__"], str)
+            ):
+                for line in item["__stdout__"].splitlines():
+                    parsed = _maybe_parse_final_from_stdout(line)
+                    if parsed:
+                        final_candidate = parsed
+                    rt = _format_reasoning_chunk(
+                        line, "stdout", reasoning_idx := reasoning_idx + 1
+                    )
+                    if rt:
+                        r_chunk = {
+                            **base,
+                            "choices": [
+                                {"index": 0, "delta": {"reasoning_content": rt}}
+                            ],
+                        }
+                        yield f"data: {json.dumps(r_chunk, ensure_ascii=False)}\n\n"
+                continue
+
+            # Observed step -> reasoning_content
+            if (
+                isinstance(item, dict)
+                and "__step__" in item
+                and isinstance(item["__step__"], str)
+            ):
+                for line in item["__step__"].splitlines():
+                    parsed = _maybe_parse_final_from_stdout(line)
+                    if parsed:
+                        final_candidate = parsed
+                    rt = _format_reasoning_chunk(
+                        line, "step", reasoning_idx := reasoning_idx + 1
+                    )
+                    if rt:
+                        r_chunk = {
+                            **base,
+                            "choices": [
+                                {"index": 0, "delta": {"reasoning_content": rt}}
+                            ],
+                        }
+                        yield f"data: {json.dumps(r_chunk, ensure_ascii=False)}\n\n"
+                continue
+
+            # Any other iterable/text from the agent -> candidate answer
+            cand = _safe_extract_candidate(item)
+            if cand:
+                final_candidate = cand
+
+            # Cooperative scheduling
+            await asyncio.sleep(0)
+
+        # Emit the visible answer once at the end (scrub any stray tags)
+        visible = scrub_think_tags(final_candidate or "")
+        if not visible or visible.strip().lower() == "none":
+            visible = "Done."
+        final_chunk = {**base, "choices": [{"index": 0, "delta": {"content": visible}}]}
+        yield f"data: {json.dumps(final_chunk, ensure_ascii=False)}\n\n"
+
+        stop_chunk = {
+            **base,
+            "choices": [{"index": 0, "delta": {}, "finish_reason": "stop"}],
+        }
+        yield f"data: {json.dumps(stop_chunk)}\n\n"
+        yield "data: [DONE]\n\n"
+
+    return _gen
+
+
+async def _run_non_streaming(task: str, agent_for_request: typing.Any) -> str:
+    """
+    Runs the agent and returns a single OpenAI-style text (with an optional <think> block).
+    """
+    reasoning_lines: typing.List[str] = []
+    final_candidate: typing.Optional[str] = None
+
+    async for item in run_agent_stream(task, agent_for_request):
+        if isinstance(item, dict) and "__error__" in item:
+            raise Exception(item["__error__"])
+
+        if isinstance(item, dict) and "__final__" in item:
+            cand = _safe_extract_candidate(item["__final__"])
+            if cand:
+                final_candidate = cand
+            continue
+
+        if isinstance(item, dict) and "__stdout__" in item:
+            lines = scrub_think_tags(item["__stdout__"]).rstrip("\n").splitlines()
+            for line in lines:
+                parsed = _maybe_parse_final_from_stdout(line)
+                if parsed:
+                    final_candidate = parsed
+                rt = _format_reasoning_chunk(line, "stdout", len(reasoning_lines) + 1)
+                if rt:
+                    reasoning_lines.append(rt)
+            continue
+
+        if isinstance(item, dict) and "__step__" in item:
+            lines = scrub_think_tags(item["__step__"]).rstrip("\n").splitlines()
+            for line in lines:
+                parsed = _maybe_parse_final_from_stdout(line)
+                if parsed:
+                    final_candidate = parsed
+                rt = _format_reasoning_chunk(line, "step", len(reasoning_lines) + 1)
+                if rt:
+                    reasoning_lines.append(rt)
+            continue
+
+        cand = _safe_extract_candidate(item)
+        if cand:
+            final_candidate = cand
+
+    reasoning_blob = _truncate_reasoning_blob("\n".join(reasoning_lines).strip())
+    think_block = f"<think>\n{reasoning_blob}\n</think>\n" if reasoning_blob else ""
+    final_text = scrub_think_tags(final_candidate or "")
+    if not final_text or final_text.strip().lower() == "none":
+        final_text = "Done."
+    return f"{think_block}{final_text}"
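For orientation, here is a minimal sketch of how these helpers could be wired into a route. It is illustrative only, not part of this commit: the real handlers live in proxy.py, and the request parsing below (a plain `request.json()` and a naive message join) is an assumption standing in for `_messages_to_task` and the typed schemas.

# Hypothetical wiring sketch (not in this commit); everything other than the
# imported helpers is an assumption.
import fastapi
import fastapi.responses

from agent_server.chat_completions import (
    _normalize_model_name,
    _agent_for_model,
    _make_sse_generator,
    _run_non_streaming,
)

app = fastapi.FastAPI()


@app.post("/v1/chat/completions")
async def chat_completions(request: fastapi.Request):
    body = await request.json()
    model_name = _normalize_model_name(body.get("model"))
    agent = _agent_for_model(model_name)  # raises ValueError for unknown local ids
    # Naive stand-in for proxy.py's _messages_to_task:
    task = "\n".join(str(m.get("content", "")) for m in body.get("messages", []))
    if body.get("stream"):
        # _make_sse_generator returns the generator function, so call it here.
        gen = _make_sse_generator(task, agent, model_name)
        return fastapi.responses.StreamingResponse(gen(), media_type="text/event-stream")
    text = await _run_non_streaming(task, agent)
    return {
        "object": "chat.completion",
        "model": model_name,
        "choices": [
            {
                "index": 0,
                "message": {"role": "assistant", "content": text},
                "finish_reason": "stop",
            }
        ],
    }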
agent_server/helpers.py CHANGED
@@ -91,3 +91,7 @@ def _sse_headers() -> dict:
         "Connection": "keep-alive",
         "X-Accel-Buffering": "no",
     }
+
+
+def now_ts() -> int:
+    return int(time.time())
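Note that `now_ts` presumes `time` is already imported at the top of helpers.py, which this hunk does not show. For context, a hedged sketch of how `_sse_headers` (the function the hunk anchors on) is typically consumed alongside the streaming generator:

# Illustrative pairing only (not part of this diff): _sse_headers() supplies
# the keep-alive / no-buffering headers shown above; now_ts() (added here)
# timestamps the OpenAI-style payloads built in chat_completions.py.
import fastapi.responses

from agent_server.helpers import _sse_headers


def sse_response(gen):
    # gen is an async generator of "data: ..." SSE lines.
    return fastapi.responses.StreamingResponse(
        gen, media_type="text/event-stream", headers=_sse_headers()
    )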
agent_server/models.py ADDED
@@ -0,0 +1,51 @@
+import os
+import agent_server.helpers
+
+
+def _models_payload() -> dict:
+    """
+    Returns the /v1/models response payload.
+    """
+    AGENT_MODEL = os.getenv("AGENT_MODEL", "Qwen/Qwen3-1.7B")
+    now = agent_server.helpers.now_ts()
+    return {
+        "object": "list",
+        "data": [
+            {
+                "id": "generator-with-managed-critic",
+                "object": "model",
+                "created": now,
+                "owned_by": "you",
+            },
+            {
+                "id": "tool-calling-agent-with-search-and-code",
+                "object": "model",
+                "created": now,
+                "owned_by": "you",
+            },
+            {
+                "id": "code-writing-agent-without-tools",
+                "object": "model",
+                "created": now,
+                "owned_by": "you",
+            },
+            {
+                "id": "code-writing-agent-with-search",
+                "object": "model",
+                "created": now,
+                "owned_by": "you",
+            },
+            {
+                "id": AGENT_MODEL,
+                "object": "model",
+                "created": now,
+                "owned_by": "upstream",
+            },
+            {
+                "id": f"{AGENT_MODEL}-nothink",
+                "object": "model",
+                "created": now,
+                "owned_by": "upstream",
+            },
+        ],
+    }
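A minimal sketch of the route this payload presumably backs (hypothetical; the actual thin handler sits in proxy.py and is outside this hunk):

# Hypothetical /v1/models handler (illustrative only).
import fastapi

from agent_server.models import _models_payload

app = fastapi.FastAPI()


@app.get("/v1/models")
async def list_models() -> dict:
    # OpenAI-compatible clients expect {"object": "list", "data": [...]}.
    return _models_payload()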
proxy.py CHANGED
@@ -4,9 +4,6 @@ Refactored for readability and modularity (single-file).
 """
 
 import os  # For dealing with env vars
-import json  # For JSON handling
-import time  # For timestamps and sleeps
-import asyncio  # For async operations
 import typing  # For type annotations
 import logging  # For logging
 
@@ -16,32 +13,26 @@ import fastapi.responses
 
 # Upstream pass-through + local helpers
 from agent_server.agent_streaming import (
-    run_agent_stream,
     _proxy_upstream_chat_completions,
 )
-from agent_server.formatting_reasoning import (
-    _format_reasoning_chunk,
-    _extract_final_text,
-    _maybe_parse_final_from_stdout,
+from agent_server.chat_completions import (
+    _normalize_model_name,
+    _is_upstream_passthrough,
+    _is_upstream_passthrough_nothink,
+    _apply_nothink_to_body,
+    _agent_for_model,
+    _make_sse_generator,
+    _run_non_streaming,
 )
 from agent_server.helpers import (
-    normalize_content_to_text,
     _messages_to_task,
     _openai_response,
    _sse_headers,
 )
+from agent_server.models import _models_payload
 from agent_server.openai_schemas import ChatMessage, ChatCompletionRequest
-from agent_server.sanitizing_think_tags import scrub_think_tags
 
 # Local agent factories
-from agents.code_writing_agents import (
-    generate_code_writing_agent_without_tools,
-    generate_code_writing_agent_with_search,
-)
-from agents.json_tool_calling_agents import (
-    generate_tool_calling_agent_with_search_and_code,
-)
-from agents.generator_and_critic import generate_generator_with_managed_critic
 
 # --------------------------------------------------------------------------------------
 # Logging / Config
@@ -49,323 +40,12 @@ from agents.generator_and_critic import generate_generator_with_managed_critic
 logging.basicConfig(level=os.getenv("LOG_LEVEL", "INFO").upper())
 log = logging.getLogger(__name__)
 
-AGENT_MODEL = os.getenv("AGENT_MODEL", "Qwen/Qwen3-1.7B")
-
 # --------------------------------------------------------------------------------------
 # FastAPI app
 # --------------------------------------------------------------------------------------
 app = fastapi.FastAPI()
 
 
-# --------------------------------------------------------------------------------------
-# Utility helpers (pure functions)
-# --------------------------------------------------------------------------------------
-def _now_ts() -> int:
-    return int(time.time())
-
-
-def _normalize_model_name(raw_model: typing.Union[str, dict, None]) -> str:
-    """
-    Accepts either a bare model string or {"id": "..."} form; default to the
-    local code-writing agent if unspecified.
-    """
-    if isinstance(raw_model, dict):
-        return typing.cast(str, raw_model.get("id", "code-writing-agent-without-tools"))
-    if isinstance(raw_model, str) and raw_model.strip():
-        return raw_model
-    return "code-writing-agent-without-tools"
-
-
-def _is_upstream_passthrough(model_name: str) -> bool:
-    return model_name == AGENT_MODEL
-
-
-def _is_upstream_passthrough_nothink(model_name: str) -> bool:
-    return model_name == f"{AGENT_MODEL}-nothink"
-
-
-def _apply_nothink_to_body(
-    body: ChatCompletionRequest, messages: typing.List[ChatMessage]
-) -> ChatCompletionRequest:
-    """
-    Mutates message content to request 'no-think' behavior upstream.
-    - Sets body["model"] to AGENT_MODEL (strip -nothink)
-    - Appends '/nothink' to user message content
-    """
-    new_body: ChatCompletionRequest = dict(body)  # shallow copy is fine
-    new_body["model"] = AGENT_MODEL
-
-    new_messages: typing.List[ChatMessage] = []
-    for msg in messages:
-        if msg.get("role") == "user":
-            content = normalize_content_to_text(msg.get("content", ""))
-            new_messages.append({"role": "user", "content": content + "\n/nothink"})
-        else:
-            new_messages.append(msg)
-    new_body["messages"] = new_messages
-    return new_body
-
-
-def _models_payload() -> dict:
-    """
-    Returns the /v1/models response payload.
-    """
-    now = _now_ts()
-    return {
-        "object": "list",
-        "data": [
-            {
-                "id": "generator-with-managed-critic",
-                "object": "model",
-                "created": now,
-                "owned_by": "you",
-            },
-            {
-                "id": "tool-calling-agent-with-search-and-code",
-                "object": "model",
-                "created": now,
-                "owned_by": "you",
-            },
-            {
-                "id": "code-writing-agent-without-tools",
-                "object": "model",
-                "created": now,
-                "owned_by": "you",
-            },
-            {
-                "id": "code-writing-agent-with-search",
-                "object": "model",
-                "created": now,
-                "owned_by": "you",
-            },
-            {
-                "id": AGENT_MODEL,
-                "object": "model",
-                "created": now,
-                "owned_by": "upstream",
-            },
-            {
-                "id": f"{AGENT_MODEL}-nothink",
-                "object": "model",
-                "created": now,
-                "owned_by": "upstream",
-            },
-        ],
-    }
-
-
-def _agent_for_model(model_name: str):
-    """
-    Returns an instantiated agent for the given local model id.
-    Raises ValueError on unknown local ids.
-    """
-    if model_name == "code-writing-agent-without-tools":
-        return generate_code_writing_agent_without_tools()
-    if model_name == "code-writing-agent-with-search":
-        return generate_code_writing_agent_with_search()
-    if model_name == "tool-calling-agent-with-search-and-code":
-        return generate_tool_calling_agent_with_search_and_code()
-    if model_name == "generator-with-managed-critic":
-        return generate_generator_with_managed_critic()
-    raise ValueError(f"Unknown model id: {model_name}")
-
-
-def _openai_stream_base(model_name: str) -> dict:
-    """
-    The base chunk used for all SSE deltas in streaming mode.
-    """
-    return {
-        "id": f"chatcmpl-smol-{_now_ts()}",
-        "object": "chat.completion.chunk",
-        "created": _now_ts(),
-        "model": model_name,
-        "choices": [
-            {
-                "index": 0,
-                "delta": {"role": "assistant"},
-                "finish_reason": None,
-            }
-        ],
-    }
-
-
-def _safe_extract_candidate(val: typing.Any) -> typing.Optional[str]:
-    """
-    Extracts a candidate final text string if present and non-empty.
-    """
-    cand = _extract_final_text(val)
-    if cand and cand.strip().lower() != "none":
-        return cand
-    return None
-
-
-def _truncate_reasoning_blob(reasoning: str, limit: int = 24000) -> str:
-    if len(reasoning) > limit:
-        return reasoning[:limit] + "\n… [truncated]"
-    return reasoning
-
-
-# --------------------------------------------------------------------------------------
-# Streaming + non-streaming execution
-# --------------------------------------------------------------------------------------
-def _make_sse_generator(
-    task: str,
-    agent_for_request: typing.Any,
-    model_name: str,
-):
-    """
-    Returns an async generator that yields SSE 'data:' lines for FastAPI StreamingResponse.
-    """
-
-    async def _gen():
-        base = _openai_stream_base(model_name)
-
-        # initial role header
-        yield f"data: {json.dumps(base)}\n\n"
-
-        reasoning_idx = 0
-        final_candidate: typing.Optional[str] = None
-
-        async for item in run_agent_stream(task, agent_for_request):
-            # Short-circuit on explicit error signaled by the runner
-            if isinstance(item, dict) and "__error__" in item:
-                error_chunk = {
-                    **base,
-                    "choices": [{"index": 0, "delta": {}, "finish_reason": "error"}],
-                }
-                yield f"data: {json.dumps(error_chunk)}\n\n"
-                yield f"data: {json.dumps({'error': item['__error__']})}\n\n"
-                break
-
-            # Explicit final (do not emit yet; keep last candidate)
-            if isinstance(item, dict) and "__final__" in item:
-                cand = _safe_extract_candidate(item["__final__"])
-                if cand:
-                    final_candidate = cand
-                continue
-
-            # Live stdout -> reasoning_content
-            if (
-                isinstance(item, dict)
-                and "__stdout__" in item
-                and isinstance(item["__stdout__"], str)
-            ):
-                for line in item["__stdout__"].splitlines():
-                    parsed = _maybe_parse_final_from_stdout(line)
-                    if parsed:
-                        final_candidate = parsed
-                    rt = _format_reasoning_chunk(
-                        line, "stdout", reasoning_idx := reasoning_idx + 1
-                    )
-                    if rt:
-                        r_chunk = {
-                            **base,
-                            "choices": [
-                                {"index": 0, "delta": {"reasoning_content": rt}}
-                            ],
-                        }
-                        yield f"data: {json.dumps(r_chunk, ensure_ascii=False)}\n\n"
-                continue
-
-            # Observed step -> reasoning_content
-            if (
-                isinstance(item, dict)
-                and "__step__" in item
-                and isinstance(item["__step__"], str)
-            ):
-                for line in item["__step__"].splitlines():
-                    parsed = _maybe_parse_final_from_stdout(line)
-                    if parsed:
-                        final_candidate = parsed
-                    rt = _format_reasoning_chunk(
-                        line, "step", reasoning_idx := reasoning_idx + 1
-                    )
-                    if rt:
-                        r_chunk = {
-                            **base,
-                            "choices": [
-                                {"index": 0, "delta": {"reasoning_content": rt}}
-                            ],
-                        }
-                        yield f"data: {json.dumps(r_chunk, ensure_ascii=False)}\n\n"
-                continue
-
-            # Any other iterable/text from agent -> candidate answer
-            cand = _safe_extract_candidate(item)
-            if cand:
-                final_candidate = cand
-
-            # Cooperative scheduling
-            await asyncio.sleep(0)
-
-        # Emit visible answer once at the end (scrub any stray tags)
-        visible = scrub_think_tags(final_candidate or "")
-        if not visible or visible.strip().lower() == "none":
-            visible = "Done."
-        final_chunk = {**base, "choices": [{"index": 0, "delta": {"content": visible}}]}
-        yield f"data: {json.dumps(final_chunk, ensure_ascii=False)}\n\n"
-
-        stop_chunk = {
-            **base,
-            "choices": [{"index": 0, "delta": {}, "finish_reason": "stop"}],
-        }
-        yield f"data: {json.dumps(stop_chunk)}\n\n"
-        yield "data: [DONE]\n\n"
-
-    return _gen
-
-
-async def _run_non_streaming(task: str, agent_for_request: typing.Any) -> str:
-    """
-    Runs the agent and returns a single OpenAI-style text (with optional <think> block).
-    """
-    reasoning_lines: typing.List[str] = []
-    final_candidate: typing.Optional[str] = None
-
-    async for item in run_agent_stream(task, agent_for_request):
-        if isinstance(item, dict) and "__error__" in item:
-            raise Exception(item["__error__"])
-
-        if isinstance(item, dict) and "__final__" in item:
-            cand = _safe_extract_candidate(item["__final__"])
-            if cand:
-                final_candidate = cand
-            continue
-
-        if isinstance(item, dict) and "__stdout__" in item:
-            lines = scrub_think_tags(item["__stdout__"]).rstrip("\n").splitlines()
-            for line in lines:
-                parsed = _maybe_parse_final_from_stdout(line)
-                if parsed:
-                    final_candidate = parsed
-                rt = _format_reasoning_chunk(line, "stdout", len(reasoning_lines) + 1)
-                if rt:
-                    reasoning_lines.append(rt)
-            continue
-
-        if isinstance(item, dict) and "__step__" in item:
-            lines = scrub_think_tags(item["__step__"]).rstrip("\n").splitlines()
-            for line in lines:
-                parsed = _maybe_parse_final_from_stdout(line)
-                if parsed:
-                    final_candidate = parsed
-                rt = _format_reasoning_chunk(line, "step", len(reasoning_lines) + 1)
-                if rt:
-                    reasoning_lines.append(rt)
-            continue
-
-        cand = _safe_extract_candidate(item)
-        if cand:
-            final_candidate = cand
-
-    reasoning_blob = _truncate_reasoning_blob("\n".join(reasoning_lines).strip())
-    think_block = f"<think>\n{reasoning_blob}\n</think>\n" if reasoning_blob else ""
-    final_text = scrub_think_tags(final_candidate or "")
-    if not final_text or final_text.strip().lower() == "none":
-        final_text = "Done."
-    return f"{think_block}{final_text}"
-
-
 # --------------------------------------------------------------------------------------
 # HTTP Handlers (thin wrappers around helpers)
 # --------------------------------------------------------------------------------------
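Finally, an end-to-end smoke test of the refactored proxy from the client side. This is a hypothetical snippet, not part of the commit: the base URL and port are assumptions, and `reasoning_content` is a non-standard delta field this server emits, which the openai Python client exposes only as an extra attribute.

# Hypothetical client-side smoke test (not in this commit). Assumes the proxy
# listens on localhost:8000 and the openai Python client (v1+) is installed.
import openai

client = openai.OpenAI(base_url="http://localhost:8000/v1", api_key="unused")

# The ids listed here come from _models_payload() above.
for m in client.models.list().data:
    print(m.id)

stream = client.chat.completions.create(
    model="code-writing-agent-without-tools",
    messages=[{"role": "user", "content": "Write fizzbuzz in Python."}],
    stream=True,
)
for chunk in stream:
    delta = chunk.choices[0].delta
    # reasoning_content is non-standard; read it defensively.
    reasoning = getattr(delta, "reasoning_content", None)
    if reasoning:
        print(reasoning, end="", flush=True)
    if delta.content:
        # The visible answer arrives as one final content delta before stop.
        print(delta.content)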