Spaces:

large-traversaal
/

Alif-1.0-8B-Instruct

Sleeping

App Files Files Community

alishafique committed on Feb 25

Commit

f88a5b9

verified ·

1 Parent(s): a2356f0

Upload 2 files

Browse files

Files changed (2) hide show

Dockerfile (2) +56 -0
app (3).py +108 -0

Dockerfile (2) ADDED Viewed

	@@ -0,0 +1,56 @@

+# Image for the Gradio + llama-cpp-python app, built on an NVIDIA CUDA development base.
+# The CUDA image tag can be overridden at build time via --build-arg CUDA_IMAGE=...
+ARG CUDA_IMAGE="12.1.1-devel-ubuntu22.04"
+FROM nvidia/cuda:${CUDA_IMAGE}
+# We need to set the host to 0.0.0.0 to allow outside access
+ENV HOST=0.0.0.0
+# System packages: build toolchain, Python, and OpenCL/BLAS development libraries.
+# The last step registers the NVIDIA OpenCL ICD.
+RUN apt-get update && apt-get upgrade -y \
+    && apt-get install -y git build-essential \
+    python3 python3-pip gcc wget \
+    ocl-icd-opencl-dev opencl-headers clinfo \
+    libclblast-dev libopenblas-dev \
+    && mkdir -p /etc/OpenCL/vendors && echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd
+# NOTE(review): this copies the build context into the image's current directory as root;
+# the app is copied again into $HOME/app further down. Confirm whether this first copy is needed.
+COPY . .
+# setting build related env
+# ENV CUDA_DOCKER_ARCH=all
+# ENV LLAMA_CUBLAS=1
+# Print toolchain versions into the build log
+RUN nvcc --version && python3 --version
+# Install dependencies
+RUN python3 -m pip install --upgrade pip pytest cmake \
+    scikit-build setuptools fastapi uvicorn sse-starlette \
+    pydantic-settings starlette-context gradio huggingface_hub hf_transfer
+# Install llama-cpp-python (build with cuda)
+# RUN CMAKE_ARGS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=75" FORCE_CMAKE=1 python3 -m pip install llama-cpp-python --force-reinstall --upgrade --no-cache-dir --verbose
+# RUN python3 -m pip install llama-cpp-python
+#RUN python3 -m pip install llama-cpp-python \
+#  --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu121
+# Prebuilt CUDA 12.1 wheel for CPython 3.10 (linux x86_64), pinned to v0.3.4
+RUN pip install https://github.com/abetlen/llama-cpp-python/releases/download/v0.3.4-cu121/llama_cpp_python-0.3.4-cp310-cp310-linux_x86_64.whl
+RUN useradd -m -u 1000 user
+# Switch to the "user" user
+USER user
+# Set home to the user's home directory.
+# HOME gets its own ENV instruction: variable substitution inside a single ENV
+# instruction uses the values from *before* that instruction, so referencing
+# $HOME in the same instruction that defines it would not expand to /home/user.
+ENV HOME=/home/user
+ENV PATH=/home/user/.local/bin:$PATH \
+    PYTHONPATH=$HOME/app \
+    PYTHONUNBUFFERED=1 \
+    GRADIO_ALLOW_FLAGGING=never \
+    GRADIO_NUM_PORTS=1 \
+    GRADIO_SERVER_NAME=0.0.0.0 \
+    GRADIO_THEME=huggingface \
+    SYSTEM=spaces
+WORKDIR $HOME/app
+# Copy the current directory contents into the container at $HOME/app setting the owner to the user
+COPY --chown=user . $HOME/app
+CMD ["python3", "app.py"]

app (3).py ADDED Viewed

	@@ -0,0 +1,108 @@

+import os
+import json
+import subprocess
+import gradio as gr
+from threading import Thread
+from huggingface_hub import hf_hub_download
+from llama_cpp import Llama
+from datetime import datetime
+# Load model from Hugging Face Hub
+MODEL_ID = "large-traversaal/Alif-1.0-8B-Instruct"
+MODEL_FILE = "model-Q8_0.gguf"
+model_path_file = hf_hub_download(MODEL_ID, filename=MODEL_FILE)
+# Initialize Llama model
+llama = Llama(
+    model_path=model_path_file,
+    n_gpu_layers=40,  # Adjust based on VRAM
+    n_threads=8,  # Match CPU cores
+    n_batch=512,  # Optimize for better VRAM usage
+    n_ctx=4096,  # Context window size
+    verbose=True  # Enable debug logging
+)
+# Function to generate responses
+def generate_response(message, history, system_prompt, temperature, max_new_tokens, top_k, repetition_penalty, top_p):
+    # chat_prompt = f"You are an Urdu Chatbot. Write an appropriate response for the given instruction: {message} Response:"
+    chat_prompt = f"{system_prompt}\n ### Instruction: {message}\n ### Response:"
+    response = llama(chat_prompt, temperature=temperature, max_tokens=max_new_tokens, top_k=top_k, repeat_penalty=repetition_penalty, top_p=top_p, stop=["Q:", "\n"], echo=False, stream=True)
+    text = ""
+    for chunk in response:
+        content = chunk["choices"][0]["text"]
+        if content:
+            text += content
+            yield text
+# def generate_response(message, history, system_prompt, temperature, max_new_tokens, top_k, repetition_penalty, top_p):
+#     """Generates a streaming response from the Llama model."""
+#     messages = [
+#         {"role": "system", "content": "You are an Urdu Chatbot. Write an appropriate response for the given instruction."},
+#     ]
+#     # Add history and the current message
+#     #for user, bot in history:
+#         #messages.append({"role": "user", "content": user})
+#         #messages.append({"role": "assistant", "content": bot})
+#     messages.append({"role": "user", "content": message})
+#     response = llama.create_chat_completion(
+#         messages=messages,
+#         stream=True,
+#     )
+#     partial_message = ""
+#     for part in response:
+#         content = part["choices"][0]["delta"].get("content", "")
+#         partial_message += content
+#         yield partial_message
+# JavaScript function for `on_load`
+on_load = """
+async()=>{ alert("Welcome to the Traversaal Alif 1.0 Chatbot! This is an experimental AI model. Please use responsibly."); }
+"""
+placeholder = """
+<center><h1>10 Questions</h1><br>Think of a person, place, or thing. I'll ask you 10 yes/no questions to try and guess it.
+</center>
+"""
+# Create custom chat UI using `gr.Blocks`
+with gr.Blocks(js=on_load, theme=gr.themes.Default()) as demo:
+    with gr.Column(scale=1, elem_id="center-content"):
+        gr.Markdown(
+            """
+            <div style="text-align: center;">
+                <h1>Alif 1.0 Urdu & English Chatbot 🚀</h1>
+                <p>Alif 1.0 8B Instruct is an open-source model with highly advanced multilingual reasoning capabilities. It utilizes human refined multilingual synthetic data paired with reasoning to enhance cultural nuance and reasoning capabilities in english and urdu languages.</p>
+            </div>
+            """,
+        )
+    chat = gr.ChatInterface(
+        generate_response,
+        #chatbot=gr.Chatbot(placeholder=placeholder),
+        #title="🚀" + " " + "Alif-1.0 Chatbot",
+        #description="Urdu AI Chatbot powered by Llama.cpp",
+        examples=[
+            ["شہر کراچی کے بارے میں بتاؤ"],
+            ["قابل تجدید توانائی کیا ہے؟"],
+            ["پاکستان کے بارے میں بتائیں"]
+        ],
+        additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False),
+        additional_inputs=[
+            gr.Textbox(value="You are an Urdu Chatbot. Write an appropriate response for the given instruction in Urdu.", label="System prompt", render=False),
+            gr.Slider(0, 1, 0.8, label="Temperature", render=False),
+            gr.Slider(128, 4096, 512, label="Max new tokens", render=False),
+            gr.Slider(1, 80, 40, step=1, label="Top K sampling", render=False),
+            gr.Slider(0, 2, 1.1, label="Repetition penalty", render=False),
+            gr.Slider(0, 1, 0.95, label="Top P sampling", render=False),
+        ],
+    )
+demo.queue(max_size=10).launch(share=True)