finetuned-llm-demo-app / run_local_llm_server.py

import os
from pathlib import Path

from utils import restart_exec, use_huggingface_cli_to_download_model

if __name__ == "__main__":
    print("*** START ***")

    current_script_folder = os.path.dirname(os.path.realpath(__file__))

    # Use HuggingFace persistent storage (/data) when it is mounted; otherwise
    # fall back to the script's own folder.
    models_path = "/data" if Path("/data").is_dir() else current_script_folder
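
    # GGUF checkpoints to serve: the original Qwen2.5-7B-Instruct-1M
    # (q4_k_m quantization) and its fine-tuned counterpart.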
    models = [
        "Qwen2.5-7B-Instruct-1M-q4_k_m-Original",
        "Qwen2.5-7B-Instruct-1M-q4_k_m-Finetuned",
    ]

    # Check whether each model is available locally; download it if not.
    for model in models:
        model_path = f"{models_path}/{model}.gguf"
        if not Path(model_path).is_file():
            print(
                f"Model [{model}] not found at [{model_path}]. Downloading...",
                flush=True,
            )
            use_huggingface_cli_to_download_model(model, models_path)
            if not Path(model_path).is_file():
                raise Exception(
                    "use_huggingface_cli_to_download_model() was called, but "
                    f"model [{model}] was still not found at [{model_path}]."
                )
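
    # One llama-server instance is started per model: the original model is
    # served on port 8080 and the fine-tuned one on port 8081, both protected
    # by the API key in the LOCAL_LLM_API_KEY environment variable.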
    llama_server_types = ["bin_gpu", "bin_cpu"]
    # Iterate in reverse so the fine-tuned model is loaded first; for each
    # model, try the GPU build of llama-server first and fall back to the
    # CPU build if the GPU build does not come up.
    for idx, model_name in reversed(list(enumerate(models))):
        for server_type in llama_server_types:
            llama_server_cmd = (
                f'{current_script_folder}/binaries/{server_type}/llama-server'
                ' --temp 0.5 --top-k 40 --top-p 0.95 --min-p 0.05'
                ' --repeat-last-n 128 --repeat-penalty 1.1'
                ' --presence-penalty 0.0 --frequency-penalty 0.0'
                ' --dry-multiplier 0.0 --xtc-probability 0.0 --typical 1.0'
                ' --n-predict -1 --ctx-size 4096 --n-gpu-layers 100'
                ' --parallel 1 --seed -1 --jinja'
                f' --api-key "{os.environ["LOCAL_LLM_API_KEY"]}"'
                f' --no-webui --host 0.0.0.0 --port 808{idx}'
                ' --mlock --log-timestamps --log-colors'
                f' --alias "{model_name}" -m "{models_path}/{model_name}.gguf"'
                f' --threads {os.cpu_count()}'
            )
            if restart_exec(llama_server_cmd, "server is listening on"):
                break

    print("*** END ***")
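
# A minimal sketch of how a client might query one of these servers, assuming
# llama-server's OpenAI-compatible HTTP API (the endpoint and payload below are
# illustrative, not part of this repo):
#
#   import os
#   import requests
#   requests.post(
#       "http://localhost:8081/v1/chat/completions",
#       headers={"Authorization": f"Bearer {os.environ['LOCAL_LLM_API_KEY']}"},
#       json={"messages": [{"role": "user", "content": "Hello!"}]},
#   )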