from utils import restart_exec, use_huggingface_cli_to_download_model

import os
from pathlib import Path

if __name__ == "__main__":
    print("*** START ***")
    current_script_folder = os.path.dirname(os.path.realpath(__file__))

    # Use HuggingFace Spaces persistent storage (/data) when it exists,
    # otherwise fall back to the script's own folder.
    models_path = "/data" if Path("/data").is_dir() else current_script_folder

    models = [
        "Qwen2.5-7B-Instruct-1M-q4_k_m-Original",
        "Qwen2.5-7B-Instruct-1M-q4_k_m-Finetuned",
    ]

    # Check whether each model is available locally; otherwise, download it.
    for model in models:
        model_path = f"{models_path}/{model}.gguf"
        if not Path(model_path).is_file():
            print(
                f"Model [{model}] not found at [{model_path}]. Downloading...",
                flush=True,
            )
            use_huggingface_cli_to_download_model(model, models_path)
            if not Path(model_path).is_file():
                raise Exception(
                    f"use_huggingface_cli_to_download_model() was called, "
                    f"but model [{model}] is still not found at [{model_path}]."
                )

    # Try the GPU build of llama-server first, then fall back to the CPU build.
    llama_server_types = ["bin_gpu", "bin_cpu"]

    # Iterate in reverse so the Finetuned model (last in the list) loads first.
    for idx, model_name in reversed(list(enumerate(models))):
        for server_type in llama_server_types:
            llama_server_cmd = (
                f"{current_script_folder}/binaries/{server_type}/llama-server"
                " --temp 0.5 --top-k 40 --top-p 0.95 --min-p 0.05"
                " --repeat-last-n 128 --repeat-penalty 1.1"
                " --presence-penalty 0.0 --frequency-penalty 0.0"
                " --dry-multiplier 0.0 --xtc-probability 0.0 --typical 1.0"
                " --n-predict -1 --ctx-size 4096 --n-gpu-layers 100"
                " --parallel 1 --seed -1 --jinja"
                f' --api-key "{os.environ["LOCAL_LLM_API_KEY"]}"'
                f" --no-webui --host 0.0.0.0 --port 808{idx}"
                " --mlock --log-timestamps --log-colors"
                f' --alias "{model_name}" -m "{models_path}/{model_name}.gguf"'
                f" --threads {os.cpu_count()}"
            )
            # Stop trying fallbacks once a server reports it is listening.
            if restart_exec(llama_server_cmd, "server is listening on"):
                break

    print("*** END ***")