NorthernTribe-Research committed on
Commit
281037f
·
verified ·
1 Parent(s): f3e3451

Promotion Space deploy (2026-03-23 12:55 UTC)

Browse files
Files changed (2) hide show
  1. app.py +96 -2
  2. requirements.txt +1 -2
app.py CHANGED
@@ -123,6 +123,12 @@ DEFAULT_GATE_MIN_PASS_AT_K = max(0.0, _safe_float(TEMPLATE_QUALITY_GATE.get("min
123
  DEFAULT_AUTO_EVAL_K = max(1, _safe_int(TEMPLATE_POST_EVAL.get("k"), 4))
124
  DEFAULT_AUTO_EVAL_SAMPLES = max(1, _safe_int(TEMPLATE_POST_EVAL.get("max_samples"), 300))
125
  DEFAULT_AUTO_PUSH_TO_HUB = bool(TEMPLATE_HUB.get("push_to_hub", True))
 
 
 
 
 
 
126
 
127
 
128
  PROJECT_DESCRIPTION = """
@@ -1759,6 +1765,15 @@ def stream_subprocess(
1759
  return ret
1760
 
1761
 
 
 
 
 
 
 
 
 
 
1762
  def make_copyable_textbox(
1763
  label: str,
1764
  lines: int,
@@ -1832,6 +1847,7 @@ def run_pipeline_core(
1832
  base_model_id = (base_model_id or "").strip()
1833
  if not base_model_id:
1834
  raise ValueError("Base model is required.")
 
1835
 
1836
  stage_start = int(start_stage)
1837
  stage_count = int(max_stages)
@@ -1882,6 +1898,7 @@ def run_pipeline_core(
1882
  "dataset_repo_id": dataset_repo_id,
1883
  "model_repo_id": model_repo_id,
1884
  "base_model_id": base_model_id,
 
1885
  "autonomous_mode": bool(autonomous_mode),
1886
  "start_stage": stage_start,
1887
  "max_stages": stage_count,
@@ -1950,7 +1967,7 @@ def run_pipeline_core(
1950
  raise RuntimeError("Run cancelled by user.")
1951
 
1952
  runtime_cfg = write_runtime_config(
1953
- base_model_id=base_model_id,
1954
  model_repo_id=model_repo_id,
1955
  train_file=train_file,
1956
  validation_file=validation_file,
@@ -2019,6 +2036,83 @@ def run_pipeline_core(
2019
  yield "\n".join(log_lines), "Cancelled", summary_text(summary)
2020
  return
2021
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2022
  if train_ret != 0:
2023
  summary["result"] = "failed"
2024
  summary["failure_stage"] = "training"
@@ -2082,7 +2176,7 @@ def run_pipeline_core(
2082
  "--config",
2083
  str(runtime_cfg),
2084
  "--base-model",
2085
- base_model_id,
2086
  "--adapter-path",
2087
  str(TRAIN_OUTPUT_DIR / "final_adapter"),
2088
  "--eval-file",
 
123
  DEFAULT_AUTO_EVAL_K = max(1, _safe_int(TEMPLATE_POST_EVAL.get("k"), 4))
124
  DEFAULT_AUTO_EVAL_SAMPLES = max(1, _safe_int(TEMPLATE_POST_EVAL.get("max_samples"), 300))
125
  DEFAULT_AUTO_PUSH_TO_HUB = bool(TEMPLATE_HUB.get("push_to_hub", True))
126
+ DEFAULT_MODEL_ARCH_FALLBACK = (os.environ.get("MODEL_ARCH_FALLBACK_BASE_MODEL") or "Qwen/Qwen2.5-0.5B-Instruct").strip()
127
+ MODEL_ARCH_ERROR_MARKERS = (
128
+ "does not recognize this architecture",
129
+ "KeyError: 'deepseek_v32'",
130
+ "model type `deepseek_v32`",
131
+ )
132
 
133
 
134
  PROJECT_DESCRIPTION = """
 
1765
  return ret
1766
 
1767
 
1768
+ def has_unrecognized_model_arch_error(log_lines: List[str]) -> bool:
1769
+ if not log_lines:
1770
+ return False
1771
+ # Scan a bounded tail to keep checks cheap while still catching tracebacks.
1772
+ tail = "\n".join(log_lines[-400:])
1773
+ folded = tail.casefold()
1774
+ return any(marker.casefold() in folded for marker in MODEL_ARCH_ERROR_MARKERS)
1775
+
1776
+
1777
  def make_copyable_textbox(
1778
  label: str,
1779
  lines: int,
 
1847
  base_model_id = (base_model_id or "").strip()
1848
  if not base_model_id:
1849
  raise ValueError("Base model is required.")
1850
+ effective_base_model_id = base_model_id
1851
 
1852
  stage_start = int(start_stage)
1853
  stage_count = int(max_stages)
 
1898
  "dataset_repo_id": dataset_repo_id,
1899
  "model_repo_id": model_repo_id,
1900
  "base_model_id": base_model_id,
1901
+ "base_model_id_effective": effective_base_model_id,
1902
  "autonomous_mode": bool(autonomous_mode),
1903
  "start_stage": stage_start,
1904
  "max_stages": stage_count,
 
1967
  raise RuntimeError("Run cancelled by user.")
1968
 
1969
  runtime_cfg = write_runtime_config(
1970
+ base_model_id=effective_base_model_id,
1971
  model_repo_id=model_repo_id,
1972
  train_file=train_file,
1973
  validation_file=validation_file,
 
2036
  yield "\n".join(log_lines), "Cancelled", summary_text(summary)
2037
  return
2038
 
2039
+ if train_ret != 0:
2040
+ fallback_model_id = DEFAULT_MODEL_ARCH_FALLBACK
2041
+ should_try_fallback = (
2042
+ not preflight_only
2043
+ and bool(fallback_model_id)
2044
+ and fallback_model_id != effective_base_model_id
2045
+ and has_unrecognized_model_arch_error(log_lines)
2046
+ )
2047
+ if should_try_fallback:
2048
+ append_log(
2049
+ log_lines,
2050
+ f"Detected unsupported model architecture for {effective_base_model_id}. "
2051
+ f"Retrying with fallback model {fallback_model_id}.",
2052
+ )
2053
+ summary["fallback"] = {
2054
+ "trigger": "unsupported_model_architecture",
2055
+ "from_model": effective_base_model_id,
2056
+ "to_model": fallback_model_id,
2057
+ }
2058
+ effective_base_model_id = fallback_model_id
2059
+ summary["base_model_id_effective"] = effective_base_model_id
2060
+ yield "\n".join(log_lines), "Retrying with fallback model", summary_text(summary)
2061
+
2062
+ runtime_cfg = write_runtime_config(
2063
+ base_model_id=effective_base_model_id,
2064
+ model_repo_id=model_repo_id,
2065
+ train_file=train_file,
2066
+ validation_file=validation_file,
2067
+ test_file=test_file,
2068
+ run_eval=bool(run_eval),
2069
+ eval_k=eval_k,
2070
+ eval_samples=eval_samples,
2071
+ push_to_hub=effective_push_to_hub,
2072
+ enforce_quality_gate=bool(enforce_quality_gate),
2073
+ gate_min_pass_at_1=gate_min_pass_at_1,
2074
+ gate_min_pass_at_k=gate_min_pass_at_k,
2075
+ gate_min_rows=gate_min_rows,
2076
+ )
2077
+ summary["runtime_config"] = str(runtime_cfg)
2078
+ append_log(log_lines, f"Wrote fallback runtime config: {runtime_cfg}")
2079
+ yield "\n".join(log_lines), "Fallback config ready", summary_text(summary)
2080
+
2081
+ fallback_cmd = [
2082
+ sys.executable,
2083
+ str(TRAIN_SCRIPT),
2084
+ "--config",
2085
+ str(runtime_cfg),
2086
+ "--start-stage",
2087
+ str(stage_start),
2088
+ "--max-stages",
2089
+ str(stage_count),
2090
+ ]
2091
+ if preflight_only:
2092
+ fallback_cmd.append("--dry-run")
2093
+
2094
+ fallback_gen = stream_subprocess(
2095
+ cmd=fallback_cmd,
2096
+ env=env,
2097
+ cwd=ROOT,
2098
+ log_lines=log_lines,
2099
+ status_prefix="Training (fallback)",
2100
+ )
2101
+ while True:
2102
+ try:
2103
+ logs_text, status_text = next(fallback_gen)
2104
+ summary["status"] = status_text
2105
+ yield logs_text, status_text, summary_text(summary)
2106
+ except StopIteration as stop:
2107
+ train_ret = stop.value
2108
+ break
2109
+
2110
+ if isinstance(summary.get("fallback"), dict):
2111
+ summary["fallback"]["result"] = "ok" if train_ret == 0 else "failed"
2112
+ if train_ret == 0:
2113
+ append_log(log_lines, f"Fallback model run succeeded with {effective_base_model_id}.")
2114
+ yield "\n".join(log_lines), "Fallback succeeded", summary_text(summary)
2115
+
2116
  if train_ret != 0:
2117
  summary["result"] = "failed"
2118
  summary["failure_stage"] = "training"
 
2176
  "--config",
2177
  str(runtime_cfg),
2178
  "--base-model",
2179
+ effective_base_model_id,
2180
  "--adapter-path",
2181
  str(TRAIN_OUTPUT_DIR / "final_adapter"),
2182
  "--eval-file",
requirements.txt CHANGED
@@ -1,7 +1,6 @@
1
  gradio>=6.6.0,<7
2
  torch>=2.3.0,<3
3
- # Track main for newest model architectures like deepseek_v32 in Spaces.
4
- transformers @ git+https://github.com/huggingface/transformers.git@main
5
  accelerate>=1.1.0,<2
6
  datasets>=2.21.0,<3
7
  peft>=0.14.0,<1
 
1
  gradio>=6.6.0,<7
2
  torch>=2.3.0,<3
3
+ transformers>=4.48.0,<5
 
4
  accelerate>=1.1.0,<2
5
  datasets>=2.21.0,<3
6
  peft>=0.14.0,<1