NorthernTribe-Research committed on
Commit
281037f
·
verified ·
1 Parent(s): f3e3451

Promotion Space deploy (2026-03-23 12:55 UTC)

Browse files
Files changed (2) hide show
  1. app.py +96 -2
  2. requirements.txt +1 -2
app.py CHANGED
@@ -123,6 +123,12 @@ DEFAULT_GATE_MIN_PASS_AT_K = max(0.0, _safe_float(TEMPLATE_QUALITY_GATE.get("min
123
  DEFAULT_AUTO_EVAL_K = max(1, _safe_int(TEMPLATE_POST_EVAL.get("k"), 4))
124
  DEFAULT_AUTO_EVAL_SAMPLES = max(1, _safe_int(TEMPLATE_POST_EVAL.get("max_samples"), 300))
125
  DEFAULT_AUTO_PUSH_TO_HUB = bool(TEMPLATE_HUB.get("push_to_hub", True))
 
 
 
 
 
 
126
 
127
 
128
  PROJECT_DESCRIPTION = """
@@ -1759,6 +1765,15 @@ def stream_subprocess(
1759
  return ret
1760
 
1761
 
 
 
 
 
 
 
 
 
 
1762
  def make_copyable_textbox(
1763
  label: str,
1764
  lines: int,
@@ -1832,6 +1847,7 @@ def run_pipeline_core(
1832
  base_model_id = (base_model_id or "").strip()
1833
  if not base_model_id:
1834
  raise ValueError("Base model is required.")
 
1835
 
1836
  stage_start = int(start_stage)
1837
  stage_count = int(max_stages)
@@ -1882,6 +1898,7 @@ def run_pipeline_core(
1882
  "dataset_repo_id": dataset_repo_id,
1883
  "model_repo_id": model_repo_id,
1884
  "base_model_id": base_model_id,
 
1885
  "autonomous_mode": bool(autonomous_mode),
1886
  "start_stage": stage_start,
1887
  "max_stages": stage_count,
@@ -1950,7 +1967,7 @@ def run_pipeline_core(
1950
  raise RuntimeError("Run cancelled by user.")
1951
 
1952
  runtime_cfg = write_runtime_config(
1953
- base_model_id=base_model_id,
1954
  model_repo_id=model_repo_id,
1955
  train_file=train_file,
1956
  validation_file=validation_file,
@@ -2019,6 +2036,83 @@ def run_pipeline_core(
2019
  yield "\n".join(log_lines), "Cancelled", summary_text(summary)
2020
  return
2021
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2022
  if train_ret != 0:
2023
  summary["result"] = "failed"
2024
  summary["failure_stage"] = "training"
@@ -2082,7 +2176,7 @@ def run_pipeline_core(
2082
  "--config",
2083
  str(runtime_cfg),
2084
  "--base-model",
2085
- base_model_id,
2086
  "--adapter-path",
2087
  str(TRAIN_OUTPUT_DIR / "final_adapter"),
2088
  "--eval-file",
 
123
  DEFAULT_AUTO_EVAL_K = max(1, _safe_int(TEMPLATE_POST_EVAL.get("k"), 4))
124
  DEFAULT_AUTO_EVAL_SAMPLES = max(1, _safe_int(TEMPLATE_POST_EVAL.get("max_samples"), 300))
125
  DEFAULT_AUTO_PUSH_TO_HUB = bool(TEMPLATE_HUB.get("push_to_hub", True))
126
+ DEFAULT_MODEL_ARCH_FALLBACK = (os.environ.get("MODEL_ARCH_FALLBACK_BASE_MODEL") or "Qwen/Qwen2.5-0.5B-Instruct").strip()
127
+ MODEL_ARCH_ERROR_MARKERS = (
128
+ "does not recognize this architecture",
129
+ "KeyError: 'deepseek_v32'",
130
+ "model type `deepseek_v32`",
131
+ )
132
 
133
 
134
  PROJECT_DESCRIPTION = """
 
1765
  return ret
1766
 
1767
 
1768
+ def has_unrecognized_model_arch_error(log_lines: List[str]) -> bool:
1769
+ if not log_lines:
1770
+ return False
1771
+ # Scan a bounded tail to keep checks cheap while still catching tracebacks.
1772
+ tail = "\n".join(log_lines[-400:])
1773
+ folded = tail.casefold()
1774
+ return any(marker.casefold() in folded for marker in MODEL_ARCH_ERROR_MARKERS)
1775
+
1776
+
1777
  def make_copyable_textbox(
1778
  label: str,
1779
  lines: int,
 
1847
  base_model_id = (base_model_id or "").strip()
1848
  if not base_model_id:
1849
  raise ValueError("Base model is required.")
1850
+ effective_base_model_id = base_model_id
1851
 
1852
  stage_start = int(start_stage)
1853
  stage_count = int(max_stages)
 
1898
  "dataset_repo_id": dataset_repo_id,
1899
  "model_repo_id": model_repo_id,
1900
  "base_model_id": base_model_id,
1901
+ "base_model_id_effective": effective_base_model_id,
1902
  "autonomous_mode": bool(autonomous_mode),
1903
  "start_stage": stage_start,
1904
  "max_stages": stage_count,
 
1967
  raise RuntimeError("Run cancelled by user.")
1968
 
1969
  runtime_cfg = write_runtime_config(
1970
+ base_model_id=effective_base_model_id,
1971
  model_repo_id=model_repo_id,
1972
  train_file=train_file,
1973
  validation_file=validation_file,
 
2036
  yield "\n".join(log_lines), "Cancelled", summary_text(summary)
2037
  return
2038
 
2039
+ if train_ret != 0:
2040
+ fallback_model_id = DEFAULT_MODEL_ARCH_FALLBACK
2041
+ should_try_fallback = (
2042
+ not preflight_only
2043
+ and bool(fallback_model_id)
2044
+ and fallback_model_id != effective_base_model_id
2045
+ and has_unrecognized_model_arch_error(log_lines)
2046
+ )
2047
+ if should_try_fallback:
2048
+ append_log(
2049
+ log_lines,
2050
+ f"Detected unsupported model architecture for {effective_base_model_id}. "
2051
+ f"Retrying with fallback model {fallback_model_id}.",
2052
+ )
2053
+ summary["fallback"] = {
2054
+ "trigger": "unsupported_model_architecture",
2055
+ "from_model": effective_base_model_id,
2056
+ "to_model": fallback_model_id,
2057
+ }
2058
+ effective_base_model_id = fallback_model_id
2059
+ summary["base_model_id_effective"] = effective_base_model_id
2060
+ yield "\n".join(log_lines), "Retrying with fallback model", summary_text(summary)
2061
+
2062
+ runtime_cfg = write_runtime_config(
2063
+ base_model_id=effective_base_model_id,
2064
+ model_repo_id=model_repo_id,
2065
+ train_file=train_file,
2066
+ validation_file=validation_file,
2067
+ test_file=test_file,
2068
+ run_eval=bool(run_eval),
2069
+ eval_k=eval_k,
2070
+ eval_samples=eval_samples,
2071
+ push_to_hub=effective_push_to_hub,
2072
+ enforce_quality_gate=bool(enforce_quality_gate),
2073
+ gate_min_pass_at_1=gate_min_pass_at_1,
2074
+ gate_min_pass_at_k=gate_min_pass_at_k,
2075
+ gate_min_rows=gate_min_rows,
2076
+ )
2077
+ summary["runtime_config"] = str(runtime_cfg)
2078
+ append_log(log_lines, f"Wrote fallback runtime config: {runtime_cfg}")
2079
+ yield "\n".join(log_lines), "Fallback config ready", summary_text(summary)
2080
+
2081
+ fallback_cmd = [
2082
+ sys.executable,
2083
+ str(TRAIN_SCRIPT),
2084
+ "--config",
2085
+ str(runtime_cfg),
2086
+ "--start-stage",
2087
+ str(stage_start),
2088
+ "--max-stages",
2089
+ str(stage_count),
2090
+ ]
2091
+ if preflight_only:
2092
+ fallback_cmd.append("--dry-run")
2093
+
2094
+ fallback_gen = stream_subprocess(
2095
+ cmd=fallback_cmd,
2096
+ env=env,
2097
+ cwd=ROOT,
2098
+ log_lines=log_lines,
2099
+ status_prefix="Training (fallback)",
2100
+ )
2101
+ while True:
2102
+ try:
2103
+ logs_text, status_text = next(fallback_gen)
2104
+ summary["status"] = status_text
2105
+ yield logs_text, status_text, summary_text(summary)
2106
+ except StopIteration as stop:
2107
+ train_ret = stop.value
2108
+ break
2109
+
2110
+ if isinstance(summary.get("fallback"), dict):
2111
+ summary["fallback"]["result"] = "ok" if train_ret == 0 else "failed"
2112
+ if train_ret == 0:
2113
+ append_log(log_lines, f"Fallback model run succeeded with {effective_base_model_id}.")
2114
+ yield "\n".join(log_lines), "Fallback succeeded", summary_text(summary)
2115
+
2116
  if train_ret != 0:
2117
  summary["result"] = "failed"
2118
  summary["failure_stage"] = "training"
 
2176
  "--config",
2177
  str(runtime_cfg),
2178
  "--base-model",
2179
+ effective_base_model_id,
2180
  "--adapter-path",
2181
  str(TRAIN_OUTPUT_DIR / "final_adapter"),
2182
  "--eval-file",
requirements.txt CHANGED
@@ -1,7 +1,6 @@
1
  gradio>=6.6.0,<7
2
  torch>=2.3.0,<3
3
- # Track main for newest model architectures like deepseek_v32 in Spaces.
4
- transformers @ git+https://github.com/huggingface/transformers.git@main
5
  accelerate>=1.1.0,<2
6
  datasets>=2.21.0,<3
7
  peft>=0.14.0,<1
 
1
  gradio>=6.6.0,<7
2
  torch>=2.3.0,<3
3
+ transformers>=4.48.0,<5
 
4
  accelerate>=1.1.0,<2
5
  datasets>=2.21.0,<3
6
  peft>=0.14.0,<1