from utils import restart_exec, use_huggingface_cli_to_download_model

import os
from pathlib import Path

if __name__ == "__main__":
    print("*** START ***")

    current_script_folder = os.path.dirname(os.path.realpath(__file__))

    # Use the HuggingFace persistent storage volume if it exists,
    # otherwise fall back to the folder this script lives in.
    models_path = "/data" if Path("/data").is_dir() else current_script_folder

    models = [
        "Qwen2.5-7B-Instruct-1M-q4_k_m-Original",
        "Qwen2.5-7B-Instruct-1M-q4_k_m-Finetuned",
    ]

    # Make sure each model is available locally; download it if it is missing.
    for model in models:
        model_path = f"{models_path}/{model}.gguf"
        if not Path(model_path).is_file():
            print(
                f"Model [{model}] not found at [{model_path}]. Downloading...",
                flush=True,
            )
            use_huggingface_cli_to_download_model(model, models_path)
            if not Path(model_path).is_file():
                raise Exception(
                    f"use_huggingface_cli_to_download_model() was called, but model "
                    f"[{model}] is still not found at [{model_path}]."
                )

    # Try the GPU build of llama-server first and fall back to the CPU build.
    llama_server_types = ["bin_gpu", "bin_cpu"]

    # Iterate in reverse to prioritize loading the Finetuned model.
    # Each model gets its own port (8080, 8081, ...).
    for idx, model_name in reversed(list(enumerate(models))):
        for server_type in llama_server_types:
            llama_server_cmd = (
                f"{current_script_folder}/binaries/{server_type}/llama-server"
                " --temp 0.5 --top-k 40 --top-p 0.95 --min-p 0.05"
                " --repeat-last-n 128 --repeat-penalty 1.1"
                " --presence-penalty 0.0 --frequency-penalty 0.0"
                " --dry-multiplier 0.0 --xtc-probability 0.0 --typical 1.0"
                " --n-predict -1 --ctx-size 4096 --n-gpu-layers 100"
                " --parallel 1 --seed -1 --jinja"
                f' --api-key "{os.environ["LOCAL_LLM_API_KEY"]}"'
                " --no-webui --host 0.0.0.0"
                f" --port 808{idx}"
                " --mlock --log-timestamps --log-colors"
                f' --alias "{model_name}"'
                f' -m "{models_path}/{model_name}.gguf"'
                f" --threads {os.cpu_count()}"
            )
            # Stop trying server builds once one of them starts successfully.
            if restart_exec(llama_server_cmd, "server is listening on"):
                break

    print("*** END ***")
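
# ---------------------------------------------------------------------------
# Illustrative sketch only. The `utils` module imported at the top is not
# shown in this file; the commented-out code below is an assumption about
# what its two helpers might look like, inferred purely from how they are
# called above. The repo id "your-hf-user/<model>" is a placeholder, and the
# real implementations may differ (e.g. `restart_exec` may also stop a
# previously running server before starting a new one).
# ---------------------------------------------------------------------------
#
# import shlex
# import subprocess
#
#
# def use_huggingface_cli_to_download_model(model_name: str, dest_dir: str) -> None:
#     """Fetch `<model_name>.gguf` into `dest_dir` with `huggingface-cli download`."""
#     subprocess.run(
#         [
#             "huggingface-cli",
#             "download",
#             f"your-hf-user/{model_name}",  # hypothetical repo id
#             f"{model_name}.gguf",
#             "--local-dir",
#             dest_dir,
#         ],
#         check=True,
#     )
#
#
# def restart_exec(command: str, ready_marker: str) -> bool:
#     """Launch `command`; return True once `ready_marker` appears in its output."""
#     proc = subprocess.Popen(
#         shlex.split(command),
#         stdout=subprocess.PIPE,
#         stderr=subprocess.STDOUT,
#         text=True,
#     )
#     for line in proc.stdout:
#         print(line, end="", flush=True)
#         if ready_marker in line:
#             # Server reported it is listening; leave it running in the background.
#             # (A real implementation would keep draining its output pipe.)
#             return True
#     return False  # Process exited before it became ready.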