Promotion Space deploy (2026-03-23 12:55 UTC)
Browse files- app.py +96 -2
- requirements.txt +1 -2
app.py
CHANGED
|
@@ -123,6 +123,12 @@ DEFAULT_GATE_MIN_PASS_AT_K = max(0.0, _safe_float(TEMPLATE_QUALITY_GATE.get("min
|
|
| 123 |
DEFAULT_AUTO_EVAL_K = max(1, _safe_int(TEMPLATE_POST_EVAL.get("k"), 4))
|
| 124 |
DEFAULT_AUTO_EVAL_SAMPLES = max(1, _safe_int(TEMPLATE_POST_EVAL.get("max_samples"), 300))
|
| 125 |
DEFAULT_AUTO_PUSH_TO_HUB = bool(TEMPLATE_HUB.get("push_to_hub", True))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 126 |
|
| 127 |
|
| 128 |
PROJECT_DESCRIPTION = """
|
|
@@ -1759,6 +1765,15 @@ def stream_subprocess(
|
|
| 1759 |
return ret
|
| 1760 |
|
| 1761 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1762 |
def make_copyable_textbox(
|
| 1763 |
label: str,
|
| 1764 |
lines: int,
|
|
@@ -1832,6 +1847,7 @@ def run_pipeline_core(
|
|
| 1832 |
base_model_id = (base_model_id or "").strip()
|
| 1833 |
if not base_model_id:
|
| 1834 |
raise ValueError("Base model is required.")
|
|
|
|
| 1835 |
|
| 1836 |
stage_start = int(start_stage)
|
| 1837 |
stage_count = int(max_stages)
|
|
@@ -1882,6 +1898,7 @@ def run_pipeline_core(
|
|
| 1882 |
"dataset_repo_id": dataset_repo_id,
|
| 1883 |
"model_repo_id": model_repo_id,
|
| 1884 |
"base_model_id": base_model_id,
|
|
|
|
| 1885 |
"autonomous_mode": bool(autonomous_mode),
|
| 1886 |
"start_stage": stage_start,
|
| 1887 |
"max_stages": stage_count,
|
|
@@ -1950,7 +1967,7 @@ def run_pipeline_core(
|
|
| 1950 |
raise RuntimeError("Run cancelled by user.")
|
| 1951 |
|
| 1952 |
runtime_cfg = write_runtime_config(
|
| 1953 |
-
base_model_id=
|
| 1954 |
model_repo_id=model_repo_id,
|
| 1955 |
train_file=train_file,
|
| 1956 |
validation_file=validation_file,
|
|
@@ -2019,6 +2036,83 @@ def run_pipeline_core(
|
|
| 2019 |
yield "\n".join(log_lines), "Cancelled", summary_text(summary)
|
| 2020 |
return
|
| 2021 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2022 |
if train_ret != 0:
|
| 2023 |
summary["result"] = "failed"
|
| 2024 |
summary["failure_stage"] = "training"
|
|
@@ -2082,7 +2176,7 @@ def run_pipeline_core(
|
|
| 2082 |
"--config",
|
| 2083 |
str(runtime_cfg),
|
| 2084 |
"--base-model",
|
| 2085 |
-
|
| 2086 |
"--adapter-path",
|
| 2087 |
str(TRAIN_OUTPUT_DIR / "final_adapter"),
|
| 2088 |
"--eval-file",
|
|
|
|
| 123 |
DEFAULT_AUTO_EVAL_K = max(1, _safe_int(TEMPLATE_POST_EVAL.get("k"), 4))
|
| 124 |
DEFAULT_AUTO_EVAL_SAMPLES = max(1, _safe_int(TEMPLATE_POST_EVAL.get("max_samples"), 300))
|
| 125 |
DEFAULT_AUTO_PUSH_TO_HUB = bool(TEMPLATE_HUB.get("push_to_hub", True))
|
| 126 |
+
DEFAULT_MODEL_ARCH_FALLBACK = (os.environ.get("MODEL_ARCH_FALLBACK_BASE_MODEL") or "Qwen/Qwen2.5-0.5B-Instruct").strip()
|
| 127 |
+
MODEL_ARCH_ERROR_MARKERS = (
|
| 128 |
+
"does not recognize this architecture",
|
| 129 |
+
"KeyError: 'deepseek_v32'",
|
| 130 |
+
"model type `deepseek_v32`",
|
| 131 |
+
)
|
| 132 |
|
| 133 |
|
| 134 |
PROJECT_DESCRIPTION = """
|
|
|
|
| 1765 |
return ret
|
| 1766 |
|
| 1767 |
|
| 1768 |
+
def has_unrecognized_model_arch_error(
    log_lines: List[str],
    markers: "tuple[str, ...] | None" = None,
    tail_lines: int = 400,
) -> bool:
    """Return True if the tail of the log mentions an unsupported model architecture.

    The last ``tail_lines`` entries of ``log_lines`` are joined with newlines
    and searched, case-insensitively (via ``str.casefold``), for any of the
    marker substrings.

    Args:
        log_lines: Accumulated log entries; ``None`` or an empty list yields
            ``False``.
        markers: Substrings to look for. When ``None`` (the default), the
            module-level ``MODEL_ARCH_ERROR_MARKERS`` is used. Empty markers
            are ignored, because an empty string is a substring of every log.
        tail_lines: How many trailing log entries to scan. A non-positive
            value scans nothing and yields ``False``.

    Returns:
        ``True`` when at least one non-empty marker occurs in the scanned
        tail, otherwise ``False``.
    """
    # Guard tail_lines explicitly: log_lines[-0:] would be the *whole* list.
    if not log_lines or tail_lines <= 0:
        return False
    if markers is None:
        markers = MODEL_ARCH_ERROR_MARKERS
    # Scan a bounded tail to keep checks cheap while still catching tracebacks.
    folded = "\n".join(log_lines[-tail_lines:]).casefold()
    return any(marker and marker.casefold() in folded for marker in markers)
|
| 1775 |
+
|
| 1776 |
+
|
| 1777 |
def make_copyable_textbox(
|
| 1778 |
label: str,
|
| 1779 |
lines: int,
|
|
|
|
| 1847 |
base_model_id = (base_model_id or "").strip()
|
| 1848 |
if not base_model_id:
|
| 1849 |
raise ValueError("Base model is required.")
|
| 1850 |
+
effective_base_model_id = base_model_id
|
| 1851 |
|
| 1852 |
stage_start = int(start_stage)
|
| 1853 |
stage_count = int(max_stages)
|
|
|
|
| 1898 |
"dataset_repo_id": dataset_repo_id,
|
| 1899 |
"model_repo_id": model_repo_id,
|
| 1900 |
"base_model_id": base_model_id,
|
| 1901 |
+
"base_model_id_effective": effective_base_model_id,
|
| 1902 |
"autonomous_mode": bool(autonomous_mode),
|
| 1903 |
"start_stage": stage_start,
|
| 1904 |
"max_stages": stage_count,
|
|
|
|
| 1967 |
raise RuntimeError("Run cancelled by user.")
|
| 1968 |
|
| 1969 |
runtime_cfg = write_runtime_config(
|
| 1970 |
+
base_model_id=effective_base_model_id,
|
| 1971 |
model_repo_id=model_repo_id,
|
| 1972 |
train_file=train_file,
|
| 1973 |
validation_file=validation_file,
|
|
|
|
| 2036 |
yield "\n".join(log_lines), "Cancelled", summary_text(summary)
|
| 2037 |
return
|
| 2038 |
|
| 2039 |
+
if train_ret != 0:
|
| 2040 |
+
fallback_model_id = DEFAULT_MODEL_ARCH_FALLBACK
|
| 2041 |
+
should_try_fallback = (
|
| 2042 |
+
not preflight_only
|
| 2043 |
+
and bool(fallback_model_id)
|
| 2044 |
+
and fallback_model_id != effective_base_model_id
|
| 2045 |
+
and has_unrecognized_model_arch_error(log_lines)
|
| 2046 |
+
)
|
| 2047 |
+
if should_try_fallback:
|
| 2048 |
+
append_log(
|
| 2049 |
+
log_lines,
|
| 2050 |
+
f"Detected unsupported model architecture for {effective_base_model_id}. "
|
| 2051 |
+
f"Retrying with fallback model {fallback_model_id}.",
|
| 2052 |
+
)
|
| 2053 |
+
summary["fallback"] = {
|
| 2054 |
+
"trigger": "unsupported_model_architecture",
|
| 2055 |
+
"from_model": effective_base_model_id,
|
| 2056 |
+
"to_model": fallback_model_id,
|
| 2057 |
+
}
|
| 2058 |
+
effective_base_model_id = fallback_model_id
|
| 2059 |
+
summary["base_model_id_effective"] = effective_base_model_id
|
| 2060 |
+
yield "\n".join(log_lines), "Retrying with fallback model", summary_text(summary)
|
| 2061 |
+
|
| 2062 |
+
runtime_cfg = write_runtime_config(
|
| 2063 |
+
base_model_id=effective_base_model_id,
|
| 2064 |
+
model_repo_id=model_repo_id,
|
| 2065 |
+
train_file=train_file,
|
| 2066 |
+
validation_file=validation_file,
|
| 2067 |
+
test_file=test_file,
|
| 2068 |
+
run_eval=bool(run_eval),
|
| 2069 |
+
eval_k=eval_k,
|
| 2070 |
+
eval_samples=eval_samples,
|
| 2071 |
+
push_to_hub=effective_push_to_hub,
|
| 2072 |
+
enforce_quality_gate=bool(enforce_quality_gate),
|
| 2073 |
+
gate_min_pass_at_1=gate_min_pass_at_1,
|
| 2074 |
+
gate_min_pass_at_k=gate_min_pass_at_k,
|
| 2075 |
+
gate_min_rows=gate_min_rows,
|
| 2076 |
+
)
|
| 2077 |
+
summary["runtime_config"] = str(runtime_cfg)
|
| 2078 |
+
append_log(log_lines, f"Wrote fallback runtime config: {runtime_cfg}")
|
| 2079 |
+
yield "\n".join(log_lines), "Fallback config ready", summary_text(summary)
|
| 2080 |
+
|
| 2081 |
+
fallback_cmd = [
|
| 2082 |
+
sys.executable,
|
| 2083 |
+
str(TRAIN_SCRIPT),
|
| 2084 |
+
"--config",
|
| 2085 |
+
str(runtime_cfg),
|
| 2086 |
+
"--start-stage",
|
| 2087 |
+
str(stage_start),
|
| 2088 |
+
"--max-stages",
|
| 2089 |
+
str(stage_count),
|
| 2090 |
+
]
|
| 2091 |
+
if preflight_only:
|
| 2092 |
+
fallback_cmd.append("--dry-run")
|
| 2093 |
+
|
| 2094 |
+
fallback_gen = stream_subprocess(
|
| 2095 |
+
cmd=fallback_cmd,
|
| 2096 |
+
env=env,
|
| 2097 |
+
cwd=ROOT,
|
| 2098 |
+
log_lines=log_lines,
|
| 2099 |
+
status_prefix="Training (fallback)",
|
| 2100 |
+
)
|
| 2101 |
+
while True:
|
| 2102 |
+
try:
|
| 2103 |
+
logs_text, status_text = next(fallback_gen)
|
| 2104 |
+
summary["status"] = status_text
|
| 2105 |
+
yield logs_text, status_text, summary_text(summary)
|
| 2106 |
+
except StopIteration as stop:
|
| 2107 |
+
train_ret = stop.value
|
| 2108 |
+
break
|
| 2109 |
+
|
| 2110 |
+
if isinstance(summary.get("fallback"), dict):
|
| 2111 |
+
summary["fallback"]["result"] = "ok" if train_ret == 0 else "failed"
|
| 2112 |
+
if train_ret == 0:
|
| 2113 |
+
append_log(log_lines, f"Fallback model run succeeded with {effective_base_model_id}.")
|
| 2114 |
+
yield "\n".join(log_lines), "Fallback succeeded", summary_text(summary)
|
| 2115 |
+
|
| 2116 |
if train_ret != 0:
|
| 2117 |
summary["result"] = "failed"
|
| 2118 |
summary["failure_stage"] = "training"
|
|
|
|
| 2176 |
"--config",
|
| 2177 |
str(runtime_cfg),
|
| 2178 |
"--base-model",
|
| 2179 |
+
effective_base_model_id,
|
| 2180 |
"--adapter-path",
|
| 2181 |
str(TRAIN_OUTPUT_DIR / "final_adapter"),
|
| 2182 |
"--eval-file",
|
requirements.txt
CHANGED
|
@@ -1,7 +1,6 @@
|
|
| 1 |
gradio>=6.6.0,<7
|
| 2 |
torch>=2.3.0,<3
|
| 3 |
-
|
| 4 |
-
transformers @ git+https://github.com/huggingface/transformers.git@main
|
| 5 |
accelerate>=1.1.0,<2
|
| 6 |
datasets>=2.21.0,<3
|
| 7 |
peft>=0.14.0,<1
|
|
|
|
| 1 |
gradio>=6.6.0,<7
|
| 2 |
torch>=2.3.0,<3
|
| 3 |
+
transformers>=4.48.0,<5
|
|
|
|
| 4 |
accelerate>=1.1.0,<2
|
| 5 |
datasets>=2.21.0,<3
|
| 6 |
peft>=0.14.0,<1
|