from llama_cpp import Llama
import gradio as gr
from huggingface_hub import hf_hub_download

# Download the quantized GGUF weights from the Hub.
# Note: the GGUF file lives in the "-GGUF" repo, and the filename needs the .gguf extension.
model_file = hf_hub_download(
    repo_id="NousResearch/Nous-Hermes-2-Mistral-7B-DPO-GGUF",
    filename="Nous-Hermes-2-Mistral-7B-DPO.Q4_K_M.gguf",
)

# Load the quantized GGUF model
llm = Llama(
    model_path=model_file,
    n_ctx=4096,
    n_threads=8,      # Lower for free Spaces; raise on hosts with more CPU cores
    n_gpu_layers=35,  # Layers to offload to the GPU, depending on GPU quota
    verbose=False,
)

# Define the response function
def chat(prompt):
    output = llm(
        prompt,
        max_tokens=512,
        temperature=0.7,
        stop=["User:", "AI:"],
    )
    return output["choices"][0]["text"]

# Text-in/text-out interface; .queue() serializes concurrent requests.
# The UI is still served, but the app also exposes a callable API endpoint.
iface = gr.Interface(fn=chat, inputs="text", outputs="text").queue()
iface.launch(server_name="0.0.0.0", server_port=7860)
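
# --- Usage sketch (run from a separate process) ---
# A minimal client-side example of calling the endpoint above with gradio_client.
# The localhost URL is an assumption for a local launch; for a deployed Space,
# pass the Space name or URL instead. gr.Interface registers its function
# under the default endpoint name "/predict".
#
#   from gradio_client import Client
#
#   client = Client("http://localhost:7860/")          # assumed local URL
#   reply = client.predict("User: Hello!\nAI:", api_name="/predict")
#   print(reply)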