Initial Version (Qwen2.5-7B-Instruct-1M-q4_k_m Original & Finetuned; Chat Web UI)
- .gitattributes +3 -0
- Dockerfile +44 -0
- build_docker.sh +2 -0
- prompt_parsing.py +211 -0
- run_docker.sh +1 -0
- run_local_llm_server.py +19 -0
- server_ui.py +170 -0
- test.py +11 -0
- total_run.sh +3 -0
- utils.py +89 -0
.gitattributes
CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.gguf filter=lfs diff=lfs merge=lfs -text
+*.so filter=lfs diff=lfs merge=lfs -text
+binaries/bin/llama-server filter=lfs diff=lfs merge=lfs -text
Dockerfile
ADDED
@@ -0,0 +1,44 @@
FROM nvidia/cuda:12.8.1-cudnn-runtime-ubuntu24.04

RUN apt-get update && apt-get install -y wget && rm -rf /var/lib/apt/lists/*
RUN wget https://repo.anaconda.com/archive/Anaconda3-2024.10-1-Linux-x86_64.sh -O ~/Anaconda3-2024.10-1-Linux-x86_64.sh
RUN bash ~/Anaconda3-2024.10-1-Linux-x86_64.sh -b -p /root/anaconda3
ENV PATH="/root/anaconda3/bin:$PATH"
RUN rm ~/Anaconda3-2024.10-1-Linux-x86_64.sh

RUN apt-get update && apt-get install -y libcurl4-openssl-dev libgomp1

WORKDIR /LLM-App

COPY binaries ./binaries
COPY Qwen2.5-7B-Instruct-1M-q4_k_m-Finetuned.gguf .
COPY Qwen2.5-7B-Instruct-1M-q4_k_m-Original.gguf .
COPY run_local_llm_server.py .

RUN chmod +x ./binaries/bin/llama-server

WORKDIR /root/anaconda3
RUN pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126

WORKDIR /LLM-App

RUN pip3 install langchain-core
RUN pip3 install pydantic-ai
RUN pip3 install langgraph
RUN pip3 install langchain-google-genai
RUN pip3 install langchain-openai

RUN apt-get -y install graphviz graphviz-dev
RUN apt-get -y install gcc
RUN pip3 install pygraphviz

COPY utils.py .
COPY test.py .

COPY prompt_parsing.py .
COPY server_ui.py .

COPY total_run.sh .
RUN chmod +x total_run.sh

CMD ["/usr/bin/bash", "./total_run.sh"]
build_docker.sh
ADDED
@@ -0,0 +1,2 @@
docker build -t finetuned-llm-demo-app-huggingface-docker .
docker rmi -f $(sudo docker images -f "dangling=true" -q)
prompt_parsing.py
ADDED
@@ -0,0 +1,211 @@
from langchain_core.prompts import ChatPromptTemplate
from typing import Dict, List, Literal
from langchain_openai import ChatOpenAI
from langchain_google_genai import ChatGoogleGenerativeAI
from os import getenv
from time import sleep
from enum import Enum
import functools
from langgraph.graph import StateGraph, START, END
from langgraph.checkpoint.memory import MemorySaver
from typing import TypedDict, Annotated, List, Any
from pydantic_ai import Agent
from pydantic_ai.models.openai import OpenAIModel
from pydantic_ai.models.gemini import GeminiModel
from pydantic_ai.messages import ModelMessage, ModelMessagesTypeAdapter
from langgraph.types import StreamWriter
from pydantic_ai.providers.openai import OpenAIProvider
from pydantic_ai.providers.google_gla import GoogleGLAProvider
from langchain_core.runnables.config import RunnableConfig


class ModelNames(Enum):
    Qwen25_7B_Instruct_1M_q4_k_m_Finetuned = "Qwen2.5-7B-Instruct-1M-q4_k_m-Finetuned"
    Qwen25_7B_Instruct_1M_q4_k_m_Original = "Qwen2.5-7B-Instruct-1M-q4_k_m-Original"


class LLMWaitTime(Enum):
    """
    OpenRouter allows 20 requests per minute, 200 requests per day for free tier, i.e. 3 seconds per request. (https://openrouter.ai/docs/api-reference/limits)
    Gemini 2.0 Flash: RPM: 15; RPD: 1,500 ➔ i.e. 4 seconds per request. (https://ai.google.dev/gemini-api/docs/rate-limits#free-tier)
    """

    OpenRouter_DeepSeek_R1 = 3
    OpenRouter_Qwen25_72B_Instruct = 3
    OpenRouter_Llama33_70B_Instruct = 3
    Google_Gemini_20_Flash = 4


LOCAL_LLM_URL = "http://127.0.0.1"
# LOCAL_LLM_URL = "http://192.168.178.45"
LOCAL_LLM_API = "36b81180-08d2-4b73-826c-3203e0698c06"

prompt_arxiv_qa = ChatPromptTemplate(
    [
        ("system", "You are a helpful Research bot."),
        (
            "human",
            'Below is the title and abstract of a paper from arXiv. Create {num_questions} pairs of questions and corresponding answers, based on the title and abstract. Avoid using abbreviations and acronyms. Questions start with the string "Question:". Answers start with the string "Answer:". Include only the list and nothing else.\n\nTitle: {title}\n\nAbstract: {abstract}',
        ),
    ]
)

prompt_arxiv_summary = ChatPromptTemplate(
    [
        ("system", "You are a helpful Research bot."),
        (
            "human",
            "Below is the title and abstract of a paper from arXiv. Summarize it, and additionally include other relevant information to help users understand the paper better.\n\nTitle: {title}\n\nAbstract: {abstract}",
        ),
    ]
)

prompt_paraphrase = ChatPromptTemplate(
    [
        ("system", "You are a helpful Research bot. {further_instruction}"),
        ("human", "Paraphrase the following {thing} below:\n\n{thing}:{sentence}"),
    ]
)

# oneshot_deepseek_llm = ChatOpenAI(
#     openai_api_key=getenv("OPENROUTER_API_KEY"),
#     openai_api_base="https://openrouter.ai/api/v1",
#     model_name="deepseek/deepseek-r1:free",
# )
# oneshot_gemini_llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash")

# oneshot_qwen_llm = ChatOpenAI(
#     openai_api_key=getenv("OPENROUTER_API_KEY"),
#     openai_api_base="https://openrouter.ai/api/v1",
#     model_name="qwen/qwen-2.5-72b-instruct:free",
# )

# oneshot_llama_llm = ChatOpenAI(
#     openai_api_key=getenv("OPENROUTER_API_KEY2"),
#     openai_api_base="https://openrouter.ai/api/v1",
#     model_name="meta-llama/llama-3.3-70b-instruct:free",
# )


def parse_arxiv_qa_prompt_output(output: str) -> List[Dict]:
    lines = output.split("\n")
    lst_qa = []
    question = ""
    answer = ""

    for line in lines:
        line = line.strip()
        if len(line) > 0:
            if line.startswith("Question:"):
                question = line[line.index(" ") + 1 :]
            elif line.startswith("Answer:"):
                answer = line[line.index(" ") + 1 :]
                lst_qa.append({"question": question, "answer": answer})
                question = ""
                answer = ""
            else:
                print(f"Error: [{line}] not question nor answer")

    return lst_qa


def llm_wait_after_request(provider: LLMWaitTime):
    def decorator(some_function):
        @functools.wraps(some_function)
        def wrapper(*args, **kwargs):
            res = some_function(*args, **kwargs)
            sleep(provider.value)
            return res

        return wrapper

    return decorator


########################################################
# Define state schema
class AgentState(TypedDict):
    latest_user_message: str
    messages: Annotated[List[bytes], lambda x, y: x + y]


reasoner_system_prompt = 'You are a helpful Artificial Intelligence (AI) Research bot, with expertise on Large Language Model (LLM). You have especially deep knowledge about the Research Paper "Byte Latent Transformer (BLT): Patches Scale Better Than Tokens". Users can ask you questions, and you will provide the corresponding answers. If the questions are related to Byte Latent Transformer (BLT), the answers must be in a detailed manner, and primarily come from the information in the Research Paper, additionally with your general knowledge. The goal is to help users understand fully.'

reasoner_agents = {
    ModelNames.Qwen25_7B_Instruct_1M_q4_k_m_Finetuned.value: Agent(
        OpenAIModel(
            ModelNames.Qwen25_7B_Instruct_1M_q4_k_m_Finetuned.value,
            provider=OpenAIProvider(
                api_key=LOCAL_LLM_API,
                base_url=f"{LOCAL_LLM_URL}:8081/v1",
            ),
        ),
        system_prompt=reasoner_system_prompt,
        retries=3,
    ),
    ModelNames.Qwen25_7B_Instruct_1M_q4_k_m_Original.value: Agent(
        OpenAIModel(
            ModelNames.Qwen25_7B_Instruct_1M_q4_k_m_Original.value,
            provider=OpenAIProvider(
                api_key=LOCAL_LLM_API,
                base_url=f"{LOCAL_LLM_URL}:8080/v1",
            ),
        ),
        system_prompt=reasoner_system_prompt,
        retries=3,
    ),
}

# router_agent = Agent(
#     GeminiModel(
#         "gemini-2.0-flash",
#         provider=GoogleGLAProvider(api_key=os.getenv("GOOGLE_API_KEY")),
#     ),
#     system_prompt="Your job is to route the user message either to the end of the conversation or to continue the conversation.",
#     retries=3,
# )


async def reasoner(state: AgentState, writer: StreamWriter, config: RunnableConfig):
    print(f"reasoner(): latest_user_message = {state['latest_user_message']}")

    model = config["configurable"]["model"]  # type: ignore
    reasoner_agent = reasoner_agents[model]

    print(f"reasoner(): chosen model = {model}")

    # Get the message history into the format for Pydantic AI
    message_history: list[ModelMessage] = []
    for message_row in state["messages"]:
        message_history.extend(ModelMessagesTypeAdapter.validate_json(message_row))

    async with reasoner_agent.run_stream(
        state["latest_user_message"], message_history=message_history
    ) as result:
        async for chunk in result.stream_text(delta=True):
            writer(chunk)
    """MyNote:
    The "new_messages_json" includes the latest user message and the AI's response.
    If first time, it will include the system prompt as well.
    """
    return {"messages": [result.new_messages_json()]}


def generate_agentic_flow():
    builder = StateGraph(AgentState)

    # Add nodes
    builder.add_node("reasoner", reasoner)

    # Set edges
    builder.add_edge(START, "reasoner")
    builder.add_edge("reasoner", END)

    # Maintain memory across different graph runs. ➔ Must also use "thread_id" in RunnableConfig/"configurable".
    memory = MemorySaver()
    agentic_flow = builder.compile(checkpointer=memory)

    # For debug
    agentic_flow.get_graph().draw_png("graph.png")

    return agentic_flow
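
For reference, a minimal driver sketch (not part of the commit) showing how the compiled flow can be exercised outside Streamlit. It mirrors the astream(..., stream_mode="custom") call and the "thread_id"/"model" configurable keys that server_ui.py passes in, and assumes both local llama-server instances are already running.

    import asyncio
    from langchain_core.runnables import RunnableConfig
    from prompt_parsing import generate_agentic_flow, ModelNames

    async def demo():
        # Compile the single-node graph and configure which local model answers.
        flow = generate_agentic_flow()
        config = RunnableConfig(
            configurable={
                "thread_id": "demo-thread",  # MemorySaver thread; reuse it to keep history
                "model": ModelNames.Qwen25_7B_Instruct_1M_q4_k_m_Finetuned.value,
            }
        )
        # stream_mode="custom" yields the text chunks emitted via StreamWriter in reasoner().
        async for chunk in flow.astream(
            {"latest_user_message": "What are the benefits of Byte Latent Transformer?"},
            config,
            stream_mode="custom",
        ):
            print(chunk, end="", flush=True)

    asyncio.run(demo())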
run_docker.sh
ADDED
@@ -0,0 +1 @@
docker run --rm -p 8501:8501 --gpus all finetuned-llm-demo-app-huggingface-docker
run_local_llm_server.py
ADDED
@@ -0,0 +1,19 @@
from utils import restart_exec
import os

if __name__ == "__main__":
    print("*** START ***")

    current_script_folder = os.path.dirname(os.path.realpath(__file__))

    models = [
        "Qwen2.5-7B-Instruct-1M-q4_k_m-Original",
        "Qwen2.5-7B-Instruct-1M-q4_k_m-Finetuned",
    ]

    for idx, model_name in enumerate(models):
        llama_server_path = f'{current_script_folder}/binaries/bin/llama-server --temp 0.5 --top-k 40 --top-p 0.95 --min-p 0.05 --repeat-last-n 128 --repeat-penalty 1.1 --presence-penalty 0.0 --frequency-penalty 0.0 --dry-multiplier 0.0 --xtc-probability 0.0 --typical 1.0 --n-predict -1 --ctx-size 4096 --n-gpu-layers 100 --seed -1 --api-key 36b81180-08d2-4b73-826c-3203e0698c06 --no-webui --host 0.0.0.0 --port 808{idx} --log-timestamps --log-colors --alias "{model_name}" -m "{current_script_folder}/{model_name}.gguf"'

        restart_exec(llama_server_path, "server is listening on")

    print("*** END ***")
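
As a quick sanity check (illustrative, not part of the commit): each llama-server instance started above exposes an OpenAI-compatible API, with port 8080 serving the Original model and 8081 the Finetuned one, both protected by the --api-key value passed on the command line. A minimal request against the standard /v1/chat/completions route could look like this:

    import requests

    # Query the "Original" instance on port 8080, using the alias set via --alias.
    resp = requests.post(
        "http://127.0.0.1:8080/v1/chat/completions",
        headers={"Authorization": "Bearer 36b81180-08d2-4b73-826c-3203e0698c06"},
        json={
            "model": "Qwen2.5-7B-Instruct-1M-q4_k_m-Original",
            "messages": [{"role": "user", "content": "Say hello in one sentence."}],
        },
        timeout=120,
    )
    print(resp.json()["choices"][0]["message"]["content"])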
server_ui.py
ADDED
@@ -0,0 +1,170 @@
import streamlit as st
import asyncio
import uuid
from utils import get_current_time
from prompt_parsing import generate_agentic_flow, ModelNames
from langchain_core.runnables import RunnableConfig
import pandas as pd
import json
import ast


# Cache functions that return global resources (e.g. database connections, ML models).
# Cached objects are shared across all users, sessions, and reruns.
@st.cache_resource
def get_global_server_session_id():
    return str(uuid.uuid4())


# Prepare data unique to this current session.
def initialize_session_if_not(force=False):
    if force or "local_session_id" not in st.session_state:
        st.session_state.local_session_id = str(uuid.uuid4())
    if force or "local_agentic_flow" not in st.session_state:
        st.session_state.local_agentic_flow = generate_agentic_flow()
    if force or "local_config" not in st.session_state:
        current_chosen_model = (
            st.session_state.get("local_config", {})
            .get("configurable", {})
            .get("model")
        )

        to_assign_model = (
            ModelNames.Qwen25_7B_Instruct_1M_q4_k_m_Finetuned.value
            if current_chosen_model is None
            else current_chosen_model
        )

        # The memory thread_id for this local session.
        st.session_state.local_config = RunnableConfig(
            configurable={
                "thread_id": st.session_state.local_session_id,
                "model": to_assign_model,
            }
        )
    if force or "messages" not in st.session_state:
        st.session_state.messages = []


async def run_agent_with_streaming(user_input: str):
    # The "config" parameter allows running the graph with different configurations (system message or LLM to use).
    async for msg in st.session_state.local_agentic_flow.astream(
        {"latest_user_message": user_input},
        st.session_state.local_config,
        stream_mode="custom",
    ):
        yield msg


def reset_conversation():
    initialize_session_if_not(True)


def choose_model():
    st.session_state.local_config["configurable"][
        "model"
    ] = st.session_state.chosen_model


@st.dialog("st.session_state:")
def display_session_state():
    ss_str = str(st.session_state)
    st.write(ss_str)


async def main():
    st.set_page_config(page_title="Finetuned LLM Demo", layout="wide")
    st.title("Finetuned LLM Demo")
    st.markdown(
        """
        <style>
            header {background-color: transparent !important;}
            .block-container {
                padding-top: 0px;
            }
            .reportview-container {
                margin-top: -2em;
            }
            #MainMenu {visibility: hidden;}
            .stDeployButton {display:none;}
            footer {visibility: hidden;}
            #stDecoration {display:none;}
        </style>
        """,
        unsafe_allow_html=True,
    )

    with st.expander("System", expanded=False):
        df = pd.DataFrame.from_records(
            [
                {
                    "Global Server Session ID": get_global_server_session_id(),
                    "Local Session ID": st.session_state.local_session_id,
                    "Page Refreshed at": get_current_time(),
                }
            ]
        )
        st.table(df)

    col1, col2 = st.columns(2)

    with col1:
        st.selectbox(
            label="Model:",
            options=tuple([e.value for e in ModelNames]),
            index=0,
            key="chosen_model",
            on_change=choose_model,
            label_visibility="collapsed",
        )

    with col2:
        st.button(label="st.session_state", on_click=display_session_state)

    st.write(
        'Ask me about LLM in particular and AI in general. BTW, I\'m an expert on "Byte Latent Transformer". Example: What are the benefits of Byte Latent Transformer?'
    )

    st.button("New Chat", on_click=reset_conversation)

    # Display chat messages from history on app rerun
    for message in st.session_state.messages:
        message_type = message["type"]
        if message_type in ["human", "ai", "system"]:
            with st.chat_message(message_type):
                st.markdown(message["content"])

    # Chat input for the user
    user_input = st.chat_input("What question is in your mind right now?")

    if user_input:
        # We append a new request to the conversation explicitly
        st.session_state.messages.append({"type": "human", "content": user_input})

        # Display user prompt in the UI
        with st.chat_message("user"):
            st.markdown(user_input)

        # Display assistant response in chat message container
        response_content = ""
        with st.chat_message("assistant"):
            message_placeholder = st.empty()  # Placeholder for updating the message
            # Run the async generator to fetch responses
            async for chunk in run_agent_with_streaming(user_input):
                response_content += chunk
                # Update the placeholder with the current response content
                message_placeholder.markdown(response_content)

        st.session_state.messages.append({"type": "ai", "content": response_content})


if __name__ == "__main__":
    print("*** START ***")
    try:
        initialize_session_if_not(False)
        asyncio.run(main())
    except Exception as ex:
        print("*** ERROR ***")
        print(ex)

    print("*** END ***")
test.py
ADDED
@@ -0,0 +1,11 @@
import torch
import sys

print(f"sys.version: {sys.version}")
print(f"torch.__version__: {torch.__version__}")
print(f"torch.cuda.is_available(): {torch.cuda.is_available()}")
print(f"torch.cuda.device_count(): {torch.cuda.device_count()}")
print(f"torch.cuda.current_device(): {torch.cuda.current_device()}")
print(f"torch.cuda.device(0): {torch.cuda.device(0)}")
print(f"torch.cuda.get_device_name(0): {torch.cuda.get_device_name(0)}")
print(f"torch.version.cuda: {torch.version.cuda}")
total_run.sh
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
python run_local_llm_server.py
|
| 3 |
+
streamlit run server_ui.py --server.port 7860
|
utils.py
ADDED
@@ -0,0 +1,89 @@
from datetime import datetime, timezone
import socket
import shlex
import subprocess
import os
import psutil


def get_current_time():
    return datetime.now(timezone.utc).strftime("%Y-%m-%d_%H-%M-%S")


def check_server(address, port):
    # Create a TCP socket
    s = socket.socket()
    print(f"Attempting to connect to {address} on port {port}")
    try:
        s.connect((address, port))
        print(f"Connected to {address} on port {port}")
        return True
    except socket.error as e:
        print(f"Connection to {address} on port {port} failed: {e}")
        return False
    finally:
        s.close()


###############################################################################
invalid_chars = frozenset('<>:"/\\|?*')


def get_valid_filename(filename: str):
    return "".join("-" if c in invalid_chars else c for c in filename)


###############################################################################
# Example usage: restart_exec(llama_server_path, "server is listening on")
def restart_exec(shell_exec_path_with_args, success_text):
    exec_path = shlex.split(shell_exec_path_with_args)[
        0
    ]  # supposed to have absolute path!
    exec_name = os.path.basename(exec_path)
    exec_folder = os.path.dirname(exec_path)

    # Kill all existing exec processes.
    for proc in psutil.process_iter():
        # check whether the process name matches
        if proc.name() == exec_name:
            proc.kill()

    # Start a new llama-server process.
    exec_proc = subprocess.Popen(
        shell_exec_path_with_args,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        shell=True,
        cwd=f"{exec_folder}",
    )

    os.set_blocking(exec_proc.stdout.fileno(), False)

    started = False

    while exec_proc.poll() is None:
        output = exec_proc.stdout.readline()
        error = exec_proc.stderr.readline()
        current_line = (
            curr_line_output
            if output and len(curr_line_output := output.strip()) > 0
            else ""
        ) + (
            curr_line_error
            if error and len(curr_line_error := error.strip()) > 0
            else ""
        )
        if len(current_line) > 0:
            # print(f"Current Line: {current_line}")
            if success_text in current_line:
                print(
                    f"➔ Found the success text [{success_text}] at [{get_current_time()}]."
                )
                started = True
                break

    if not started:
        print(f"Failed to start the {exec_name}. returncode: {exec_proc.returncode}")

    return started
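
check_server() is not wired into the other scripts; as an illustrative sketch (an assumption, not part of the commit), it could be used to poll until both llama-server ports accept connections before the Streamlit UI starts:

    from time import sleep
    from utils import check_server

    # Wait until the Original (8080) and Finetuned (8081) servers accept TCP connections.
    for port in (8080, 8081):
        while not check_server("127.0.0.1", port):
            sleep(2)
    print("Both model servers are reachable.")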