github-actions[bot] committed on
Commit
cb5a5af
·
1 Parent(s): d778d65

🚀 Auto-deploy backend from GitHub (7408e56)

Browse files
Dockerfile CHANGED
@@ -23,6 +23,7 @@ RUN python -m pip install --upgrade pip setuptools wheel && \
23
 
24
  # Copy only runtime sources to reduce invalidation surface.
25
  COPY main.py /app/main.py
 
26
  COPY analytics.py /app/analytics.py
27
  COPY automation_engine.py /app/automation_engine.py
28
  COPY services /app/services
 
23
 
24
  # Copy only runtime sources to reduce invalidation surface.
25
  COPY main.py /app/main.py
26
+ COPY startup_validation.py /app/startup_validation.py
27
  COPY analytics.py /app/analytics.py
28
  COPY automation_engine.py /app/automation_engine.py
29
  COPY services /app/services
config/env.sample CHANGED
@@ -38,8 +38,8 @@ INFERENCE_INTERACTIVE_TIMEOUT_SEC=55
38
  INFERENCE_BACKGROUND_TIMEOUT_SEC=120
39
 
40
  # model defaults
41
- # Leave empty unless you intentionally want one global model for every task.
42
- INFERENCE_MODEL_ID=
43
  INFERENCE_MAX_NEW_TOKENS=640
44
  INFERENCE_TEMPERATURE=0.2
45
  INFERENCE_TOP_P=0.9
@@ -54,7 +54,7 @@ CHAT_STREAM_NO_TOKEN_TIMEOUT_SEC=25
54
  CHAT_STREAM_TOTAL_TIMEOUT_SEC=120
55
  # Optional: force quiz-generation model. Leave empty to use routing.task_model_map.quiz_generation.
56
  HF_QUIZ_MODEL_ID=
57
- HF_QUIZ_JSON_REPAIR_MODEL_ID=meta-llama/Llama-3.1-8B-Instruct
58
 
59
  # retry behavior
60
  INFERENCE_MAX_RETRIES=3
 
38
  INFERENCE_BACKGROUND_TIMEOUT_SEC=120
39
 
40
  # model defaults
41
+ # Global default model for all tasks.
42
+ INFERENCE_MODEL_ID=Qwen/Qwen2.5-7B-Instruct
43
  INFERENCE_MAX_NEW_TOKENS=640
44
  INFERENCE_TEMPERATURE=0.2
45
  INFERENCE_TOP_P=0.9
 
54
  CHAT_STREAM_TOTAL_TIMEOUT_SEC=120
55
  # Optional: force quiz-generation model. Leave empty to use routing.task_model_map.quiz_generation.
56
  HF_QUIZ_MODEL_ID=
57
+ HF_QUIZ_JSON_REPAIR_MODEL_ID=Qwen/Qwen2.5-7B-Instruct
58
 
59
  # retry behavior
60
  INFERENCE_MAX_RETRIES=3
config/models.yaml CHANGED
@@ -1,7 +1,7 @@
1
  models:
2
  primary:
3
- id: meta-llama/Llama-3.1-8B-Instruct
4
- description: Fast default instruction model for interactive Grade 11-12 math tutoring
5
  max_new_tokens: 640
6
  temperature: 0.25
7
  top_p: 0.9
@@ -26,24 +26,23 @@ models:
26
 
27
  routing:
28
  task_model_map:
29
- # Chat default: Llama-3.1-8B for low latency.
30
- # Hard prompts can escalate to 70B via runtime policy in inference_client.
31
- chat: meta-llama/Llama-3.1-8B-Instruct
32
- verify_solution: NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO
33
- lesson_generation: meta-llama/Llama-3.1-8B-Instruct
34
- quiz_generation: meta-llama/Llama-3.1-8B-Instruct
35
- learning_path: meta-llama/Llama-3.1-8B-Instruct
36
- daily_insight: meta-llama/Llama-3.1-8B-Instruct
37
- risk_classification: meta-llama/Llama-3.1-8B-Instruct
38
- risk_narrative: meta-llama/Llama-3.1-8B-Instruct
39
 
40
  task_fallback_model_map:
41
  chat:
42
  - meta-llama/Meta-Llama-3-70B-Instruct # Hard/fallback quality tier
43
  - google/gemma-2-2b-it # Fast safety fallback
44
  verify_solution:
45
- - NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO # Primary (reasoning-focused)
46
- - meta-llama/Meta-Llama-3-70B-Instruct # First fallback
47
  - meta-llama/Llama-3.1-8B-Instruct # Second fallback
48
 
49
  task_provider_map:
 
1
  models:
2
  primary:
3
+ id: Qwen/Qwen2.5-7B-Instruct
4
+ description: Global default instruction model for interactive Grade 11-12 math tutoring
5
  max_new_tokens: 640
6
  temperature: 0.25
7
  top_p: 0.9
 
26
 
27
  routing:
28
  task_model_map:
29
+ # Global default: Qwen2.5-7B across all tasks.
30
+ # Hard prompts can still escalate via runtime policy in inference_client.
31
+ chat: Qwen/Qwen2.5-7B-Instruct
32
+ verify_solution: Qwen/Qwen2.5-7B-Instruct
33
+ lesson_generation: Qwen/Qwen2.5-7B-Instruct
34
+ quiz_generation: Qwen/Qwen2.5-7B-Instruct
35
+ learning_path: Qwen/Qwen2.5-7B-Instruct
36
+ daily_insight: Qwen/Qwen2.5-7B-Instruct
37
+ risk_classification: Qwen/Qwen2.5-7B-Instruct
38
+ risk_narrative: Qwen/Qwen2.5-7B-Instruct
39
 
40
  task_fallback_model_map:
41
  chat:
42
  - meta-llama/Meta-Llama-3-70B-Instruct # Hard/fallback quality tier
43
  - google/gemma-2-2b-it # Fast safety fallback
44
  verify_solution:
45
+ - meta-llama/Meta-Llama-3-70B-Instruct # Higher-capacity fallback
 
46
  - meta-llama/Llama-3.1-8B-Instruct # Second fallback
47
 
48
  task_provider_map:
services/inference_client.py CHANGED
@@ -107,7 +107,7 @@ class InferenceClient:
107
  self.pro_route_header_name = os.getenv("INFERENCE_PRO_ROUTE_HEADER_NAME", "")
108
  self.pro_route_header_value = os.getenv("INFERENCE_PRO_ROUTE_HEADER_VALUE", "true")
109
 
110
- default_model_fallback = str(primary.get("id") or "meta-llama/Llama-3.1-8B-Instruct")
111
  env_model_id = os.getenv("INFERENCE_MODEL_ID", "").strip()
112
  self.default_model = env_model_id or default_model_fallback
113
 
@@ -177,16 +177,16 @@ class InferenceClient:
177
  int(os.getenv("INFERENCE_INTERACTIVE_MAX_FALLBACK_DEPTH", "1")),
178
  )
179
 
180
- # Default task-to-model routing (chat defaults to fast 8B, with hard-prompt escalation to 70B)
181
  self.task_model_map: Dict[str, str] = {
182
- "chat": "meta-llama/Llama-3.1-8B-Instruct",
183
  "verify_solution": "Qwen/Qwen2.5-7B-Instruct",
184
  "lesson_generation": "Qwen/Qwen2.5-7B-Instruct",
185
  "quiz_generation": "Qwen/Qwen2.5-7B-Instruct",
186
  "learning_path": "Qwen/Qwen2.5-7B-Instruct",
187
- "daily_insight": self.default_model,
188
- "risk_classification": self.default_model,
189
- "risk_narrative": self.default_model,
190
  }
191
  # Fallback chains (only to other HF-supported models, no featherless-ai)
192
  self.task_fallback_model_map: Dict[str, List[str]] = {
 
107
  self.pro_route_header_name = os.getenv("INFERENCE_PRO_ROUTE_HEADER_NAME", "")
108
  self.pro_route_header_value = os.getenv("INFERENCE_PRO_ROUTE_HEADER_VALUE", "true")
109
 
110
+ default_model_fallback = str(primary.get("id") or "Qwen/Qwen2.5-7B-Instruct")
111
  env_model_id = os.getenv("INFERENCE_MODEL_ID", "").strip()
112
  self.default_model = env_model_id or default_model_fallback
113
 
 
177
  int(os.getenv("INFERENCE_INTERACTIVE_MAX_FALLBACK_DEPTH", "1")),
178
  )
179
 
180
+ # Default task-to-model routing (global default set to Qwen2.5-7B)
181
  self.task_model_map: Dict[str, str] = {
182
+ "chat": "Qwen/Qwen2.5-7B-Instruct",
183
  "verify_solution": "Qwen/Qwen2.5-7B-Instruct",
184
  "lesson_generation": "Qwen/Qwen2.5-7B-Instruct",
185
  "quiz_generation": "Qwen/Qwen2.5-7B-Instruct",
186
  "learning_path": "Qwen/Qwen2.5-7B-Instruct",
187
+ "daily_insight": "Qwen/Qwen2.5-7B-Instruct",
188
+ "risk_classification": "Qwen/Qwen2.5-7B-Instruct",
189
+ "risk_narrative": "Qwen/Qwen2.5-7B-Instruct",
190
  }
191
  # Fallback chains (only to other HF-supported models, no featherless-ai)
192
  self.task_fallback_model_map: Dict[str, List[str]] = {