from llama_cpp import Llama
import gradio as gr
from huggingface_hub import hf_hub_download

# Download the quantized GGUF weights from the Hub.
# Note: the GGUF file lives in the "-GGUF" repo, and the filename needs the .gguf extension.
model_file = hf_hub_download(
    repo_id="NousResearch/Nous-Hermes-2-Mistral-7B-DPO-GGUF",
    filename="Nous-Hermes-2-Mistral-7B-DPO.Q4_K_M.gguf",
)

# Load the quantized GGUF model
llm = Llama(
    model_path=model_file,
    n_ctx=4096,
    n_threads=8,      # Lower for free Spaces; raise on hosts with more CPU cores
    n_gpu_layers=35,  # Layers to offload to the GPU, depending on GPU quota
    verbose=False,
)

# Define the response function
def chat(prompt):
    output = llm(
        prompt,
        max_tokens=512,
        temperature=0.7,
        stop=["User:", "AI:"],
    )
    return output["choices"][0]["text"]

# Text-in/text-out interface; .queue() serializes concurrent requests.
# The UI is still served, but the app also exposes a callable API endpoint.
iface = gr.Interface(fn=chat, inputs="text", outputs="text").queue()
iface.launch(server_name="0.0.0.0", server_port=7860)
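
# --- Usage sketch (run from a separate process) ---
# A minimal client-side example of calling the endpoint above with gradio_client.
# The localhost URL is an assumption for a local launch; for a deployed Space,
# pass the Space name or URL instead. gr.Interface registers its function
# under the default endpoint name "/predict".
#
#   from gradio_client import Client
#
#   client = Client("http://localhost:7860/")          # assumed local URL
#   reply = client.predict("User: Hello!\nAI:", api_name="/predict")
#   print(reply)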