from utils import restart_exec, use_huggingface_cli_to_download_model

import os
from pathlib import Path

if __name__ == "__main__":
    print("*** START ***")

    current_script_folder = os.path.dirname(os.path.realpath(__file__))

    # Use the HuggingFace persistent storage volume if it exists,
    # otherwise fall back to the folder this script lives in.
    models_path = "/data" if Path("/data").is_dir() else current_script_folder

    models = [
        "Qwen2.5-7B-Instruct-1M-q4_k_m-Original",
        "Qwen2.5-7B-Instruct-1M-q4_k_m-Finetuned",
    ]

    # Make sure each model is available locally; download it if it is missing.
    for model in models:
        model_path = f"{models_path}/{model}.gguf"
        if not Path(model_path).is_file():
            print(
                f"Model [{model}] not found at [{model_path}]. Downloading...",
                flush=True,
            )
            use_huggingface_cli_to_download_model(model, models_path)
            if not Path(model_path).is_file():
                raise Exception(
                    f"use_huggingface_cli_to_download_model() was called, but model "
                    f"[{model}] is still not found at [{model_path}]."
                )

    # Try the GPU build of llama-server first and fall back to the CPU build.
    llama_server_types = ["bin_gpu", "bin_cpu"]

    # Iterate in reverse to prioritize loading the Finetuned model.
    # Each model gets its own port (8080, 8081, ...).
    for idx, model_name in reversed(list(enumerate(models))):
        for server_type in llama_server_types:
            llama_server_cmd = (
                f"{current_script_folder}/binaries/{server_type}/llama-server"
                " --temp 0.5 --top-k 40 --top-p 0.95 --min-p 0.05"
                " --repeat-last-n 128 --repeat-penalty 1.1"
                " --presence-penalty 0.0 --frequency-penalty 0.0"
                " --dry-multiplier 0.0 --xtc-probability 0.0 --typical 1.0"
                " --n-predict -1 --ctx-size 4096 --n-gpu-layers 100"
                " --parallel 1 --seed -1 --jinja"
                f' --api-key "{os.environ["LOCAL_LLM_API_KEY"]}"'
                " --no-webui --host 0.0.0.0"
                f" --port 808{idx}"
                " --mlock --log-timestamps --log-colors"
                f' --alias "{model_name}"'
                f' -m "{models_path}/{model_name}.gguf"'
                f" --threads {os.cpu_count()}"
            )
            # Stop trying server builds once one of them starts successfully.
            if restart_exec(llama_server_cmd, "server is listening on"):
                break

    print("*** END ***")
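
# ---------------------------------------------------------------------------
# Illustrative sketch only. The `utils` module imported at the top is not
# shown in this file; the commented-out code below is an assumption about
# what its two helpers might look like, inferred purely from how they are
# called above. The repo id "your-hf-user/<model>" is a placeholder, and the
# real implementations may differ (e.g. `restart_exec` may also stop a
# previously running server before starting a new one).
# ---------------------------------------------------------------------------
#
# import shlex
# import subprocess
#
#
# def use_huggingface_cli_to_download_model(model_name: str, dest_dir: str) -> None:
#     """Fetch `<model_name>.gguf` into `dest_dir` with `huggingface-cli download`."""
#     subprocess.run(
#         [
#             "huggingface-cli",
#             "download",
#             f"your-hf-user/{model_name}",  # hypothetical repo id
#             f"{model_name}.gguf",
#             "--local-dir",
#             dest_dir,
#         ],
#         check=True,
#     )
#
#
# def restart_exec(command: str, ready_marker: str) -> bool:
#     """Launch `command`; return True once `ready_marker` appears in its output."""
#     proc = subprocess.Popen(
#         shlex.split(command),
#         stdout=subprocess.PIPE,
#         stderr=subprocess.STDOUT,
#         text=True,
#     )
#     for line in proc.stdout:
#         print(line, end="", flush=True)
#         if ready_marker in line:
#             # Server reported it is listening; leave it running in the background.
#             # (A real implementation would keep draining its output pipe.)
#             return True
#     return False  # Process exited before it became ready.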