finetuned-llm-demo-app / run_local_llm_server.py

import os
from pathlib import Path

from utils import restart_exec, use_huggingface_cli_to_download_model

if __name__ == "__main__":
    print("*** START ***")

    current_script_folder = os.path.dirname(os.path.realpath(__file__))

    # Use HuggingFace persistent storage (/data) when it is mounted; otherwise
    # fall back to the script's own folder.
    models_path = "/data" if Path("/data").is_dir() else current_script_folder
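
    # GGUF checkpoints to serve: the original Qwen2.5-7B-Instruct-1M
    # (q4_k_m quantization) and its fine-tuned counterpart.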
    models = [
        "Qwen2.5-7B-Instruct-1M-q4_k_m-Original",
        "Qwen2.5-7B-Instruct-1M-q4_k_m-Finetuned",
    ]

    # Check whether each model is available locally; download it if not.
    for model in models:
        model_path = f"{models_path}/{model}.gguf"
        if not Path(model_path).is_file():
            print(
                f"Model [{model}] not found at [{model_path}]. Downloading...",
                flush=True,
            )
            use_huggingface_cli_to_download_model(model, models_path)
            if not Path(model_path).is_file():
                raise Exception(
                    "use_huggingface_cli_to_download_model() was called, but "
                    f"model [{model}] was still not found at [{model_path}]."
                )
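
    # One llama-server instance is started per model: the original model is
    # served on port 8080 and the fine-tuned one on port 8081, both protected
    # by the API key in the LOCAL_LLM_API_KEY environment variable.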
    llama_server_types = ["bin_gpu", "bin_cpu"]
    # Iterate in reverse so the fine-tuned model is loaded first; for each
    # model, try the GPU build of llama-server first and fall back to the
    # CPU build if the GPU build does not come up.
    for idx, model_name in reversed(list(enumerate(models))):
        for server_type in llama_server_types:
            llama_server_cmd = (
                f'{current_script_folder}/binaries/{server_type}/llama-server'
                ' --temp 0.5 --top-k 40 --top-p 0.95 --min-p 0.05'
                ' --repeat-last-n 128 --repeat-penalty 1.1'
                ' --presence-penalty 0.0 --frequency-penalty 0.0'
                ' --dry-multiplier 0.0 --xtc-probability 0.0 --typical 1.0'
                ' --n-predict -1 --ctx-size 4096 --n-gpu-layers 100'
                ' --parallel 1 --seed -1 --jinja'
                f' --api-key "{os.environ["LOCAL_LLM_API_KEY"]}"'
                f' --no-webui --host 0.0.0.0 --port 808{idx}'
                ' --mlock --log-timestamps --log-colors'
                f' --alias "{model_name}" -m "{models_path}/{model_name}.gguf"'
                f' --threads {os.cpu_count()}'
            )
            if restart_exec(llama_server_cmd, "server is listening on"):
                break

    print("*** END ***")
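
# A minimal sketch of how a client might query one of these servers, assuming
# llama-server's OpenAI-compatible HTTP API (the endpoint and payload below are
# illustrative, not part of this repo):
#
#   import os
#   import requests
#   requests.post(
#       "http://localhost:8081/v1/chat/completions",
#       headers={"Authorization": f"Bearer {os.environ['LOCAL_LLM_API_KEY']}"},
#       json={"messages": [{"role": "user", "content": "Hello!"}]},
#   )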