import base64
import html
import mimetypes
import os
from pathlib import Path
from typing import Any, Dict, List

import gradio as gr
from openai import OpenAI

DEFAULT_MODEL = os.getenv("DEFAULT_MODEL", "ERNIE-4.5-VL-28B-A3B-Thinking")
BASE_URL = os.getenv("BASE_URL", "")
API_KEY = os.getenv("ERNIE_API_KEY", "")


CUSTOM_CSS = """
body {
    background: radial-gradient(circle at top, #fdfbff 0%, #e7ecf7 45%, #dfe6f5 100%);
    font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Arial, sans-serif;
    color: #0f172a;
}
.gradio-container {
    max-width: 1200px !important;
    margin: 0 auto;
}
#ernie-hero {
    padding: 12px 0 4px;
}
#ernie-hero h1 {
    font-size: 1.85rem;
    margin-bottom: 0;
    font-weight: 500;
}
#model-link {
    margin-top: 6px;
    font-size: 0.95rem;
}
#model-link a {
    color: #4c1d95;
    text-decoration: none;
    font-weight: 500;
}
#model-link a:hover {
    text-decoration: underline;
}
#examples-panel {
    margin-top: 20px;
    padding: 18px 22px;
    border-radius: 18px;
    border: 1px solid rgba(15, 23, 42, 0.12);
    background: rgba(255, 255, 255, 0.92);
    box-shadow: 0 15px 35px rgba(15, 23, 42, 0.08);
    gap: 18px;
}
#examples-panel h4 {
    margin: 0 0 8px;
    font-size: 1.1rem;
    font-weight: 500;
}
#examples-panel p {
    margin: 0;
    color: rgba(15, 23, 42, 0.7);
    font-size: 0.95rem;
}
#examples-grid table {
    width: 100%;
}
#examples-grid table tbody {
    display: grid;
    grid-template-columns: repeat(auto-fit, minmax(220px, 1fr));
    gap: 12px;
}
#examples-grid table tr {
    display: block;
    background: #f7f9ff;
    border-radius: 14px;
    border: 1px solid rgba(15, 23, 42, 0.08);
    padding: 14px;
    box-shadow: 0 10px 28px rgba(15, 23, 42, 0.08);
}
#examples-grid table td {
    display: block;
    padding: 0;
}
#chat-wrapper {
    margin-top: 32px;
    border-radius: 24px;
    padding: 18px;
    background: rgba(255, 255, 255, 0.95);
    border: 1px solid rgba(15, 23, 42, 0.1);
    box-shadow: 0 25px 60px rgba(15, 23, 42, 0.12);
}
.ernie-section {
    border-radius: 18px;
    margin-bottom: 14px;
    padding: 16px 18px;
    border: 1px solid rgba(15, 23, 42, 0.1);
    background: rgba(255, 255, 255, 0.95);
    box-shadow: 0 10px 24px rgba(15, 23, 42, 0.08);
}
.ernie-section-header {
    font-size: 0.85rem;
    text-transform: uppercase;
    letter-spacing: 0.08em;
    font-weight: 600;
    color: rgba(15, 23, 42, 0.65);
    display: flex;
    align-items: center;
    gap: 6px;
}
.ernie-section-body {
    margin-top: 10px;
    font-size: 1rem;
    color: rgba(15, 23, 42, 0.92);
    white-space: pre-wrap;
    line-height: 1.55;
}
.ernie-thinking {
    border-color: rgba(79, 70, 229, 0.35);
    background: rgba(129, 140, 248, 0.08);
}
.ernie-answer {
    border-color: rgba(16, 185, 129, 0.35);
    background: rgba(110, 231, 183, 0.08);
}

@media (prefers-color-scheme: dark) {
    body {
        background: radial-gradient(circle at top, #1f264b 0%, #0f172a 45%, #040713 100%);
        color: #ecf2ff;
    }
    #model-link a {
        color: #a5b4fc;
    }
    #examples-panel {
        border: 1px solid rgba(255, 255, 255, 0.05);
        background: rgba(8, 13, 30, 0.85);
        box-shadow: 0 15px 45px rgba(3, 7, 18, 0.55);
    }
    #examples-panel p {
        color: rgba(236, 242, 255, 0.75);
    }
    #examples-grid table tr {
        background: rgba(15, 23, 42, 0.7);
        border: 1px solid rgba(255, 255, 255, 0.04);
        box-shadow: 0 10px 30px rgba(4, 6, 15, 0.45);
    }
    #chat-wrapper {
        background: rgba(2, 6, 23, 0.78);
        border: 1px solid rgba(99, 102, 241, 0.25);
        box-shadow: 0 25px 70px rgba(2, 6, 23, 0.7);
    }
    .ernie-section {
        border: 1px solid rgba(255, 255, 255, 0.08);
        background: rgba(15, 23, 42, 0.85);
        box-shadow: 0 10px 30px rgba(2, 6, 23, 0.55);
    }
    .ernie-section-header {
        color: rgba(236, 242, 255, 0.75);
    }
    .ernie-section-body {
        color: rgba(248, 250, 255, 0.95);
    }
    .ernie-answer {
        border-color: rgba(45, 212, 191, 0.45);
        background: rgba(8, 47, 56, 0.65);
    }
    .ernie-thinking {
        border-color: rgba(165, 180, 252, 0.4);
        background: rgba(30, 27, 75, 0.65);
    }
}
"""

_client = OpenAI(
    base_url=BASE_URL,
    api_key=API_KEY,
)

def _data_url(path: str) -> str:
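    """Encode a local file as a base64 data URL with a guessed MIME type."""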
    mime, _ = mimetypes.guess_type(path)
    mime = mime or "application/octet-stream"
    data = base64.b64encode(Path(path).read_bytes()).decode("utf-8")
    return f"data:{mime};base64,{data}"

def _media_content(path: str) -> Dict[str, Any]:
    """支持图片和视频"""
    mime, _ = mimetypes.guess_type(path)
    if mime and mime.startswith("video"):
        # 视频格式
        return {"type": "video_url", "video_url": {"url": _data_url(path)}}
    else:
        # 图片格式(默认)
        return {"type": "image_url", "image_url": {"url": _data_url(path)}}

def _text_content(text: str) -> Dict[str, Any]:
    return {"type": "text", "text": text}

def _message(role: str, content: Any) -> Dict[str, Any]:
    return {"role": role, "content": content}

def _format_sections(thinking: str, answer: str | None = None) -> str:
    """Render Thinking/Answer blocks with HTML so the chatbot can style them."""
    def _build_block(kind: str, label: str, text: str, icon: str) -> str:
        text = (text or "").strip()
        if not text:
            return ""
        escaped = html.escape(text)
        return (
            f'<div class="ernie-section ernie-{kind}">'
            f'<div class="ernie-section-header">{icon} {label}</div>'
            f'<div class="ernie-section-body">{escaped}</div>'
            "</div>"
        )

    sections = [
        _build_block("thinking", "Thinking", thinking, "🧠"),
        _build_block("answer", "Answer", answer, "✨") if answer is not None else "",
    ]
    rendered = "".join(section for section in sections if section)
    return rendered

def _build_user_message(message: Dict[str, Any]) -> Dict[str, Any]:
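    """Convert a MultimodalTextbox payload into an OpenAI-style user message."""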
    files = message.get("files") or []
    text = (message.get("text") or "").strip()
    content: List[Dict[str, Any]] = [_media_content(p) for p in files]
    if text:
        content.append(_text_content(text))
    return _message("user", content)

def _convert_history(history: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
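    """Convert Gradio chat history (type="messages") into OpenAI-style messages.

    Consecutive user items (text and file paths) are merged into one user message;
    assistant turns are reduced to their Answer text only.
    """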
    msgs: List[Dict[str, Any]] = []
    user_content: List[Dict[str, Any]] = []
    
    for turn in history or []:
        role, content = turn.get("role"), turn.get("content")
        if role == "user":
            if isinstance(content, str):
                user_content.append(_text_content(content))
            elif isinstance(content, tuple):
                user_content.extend(_media_content(path) for path in content if path)
        elif role == "assistant":
            if "Answer:\n" in content:
                # Split off and keep only the Answer portion
                answer_only = content.split("Answer:\n", 1)[1].strip()
            else:
                # No Thinking prefix: use the content as-is
                answer_only = content.strip()

            if user_content:
                msgs.append(_message("user", user_content.copy()))
                user_content.clear()

            msgs.append(_message("assistant", [{"type": "text", "text": answer_only}]))
            
    return msgs


def stream_response(message: Dict[str, Any], history: List[Dict[str, Any]], model_name: str = DEFAULT_MODEL):
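    """Stream a chat completion, rendering Thinking and Answer blocks as HTML."""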
    messages = _convert_history(history)
    messages.append(_build_user_message(message))
  
    try:
        stream = _client.chat.completions.create(
            model="default",
            messages=messages,
            stream=True
        )
        thinking_parts: List[str] = []
        answer_parts: List[str] = []
        answer_started = False

        for chunk in stream:
            delta = chunk.choices[0].delta

            if getattr(delta, "reasoning_content", None):
                thinking_parts.append(delta.reasoning_content)

            if getattr(delta, "content", None):
                answer_started = True
                answer_parts.append(delta.content)

            thinking_text = "".join(thinking_parts)
            answer_text = "".join(answer_parts) if answer_parts else None

            if answer_started:
                rendered = _format_sections(thinking_text, answer_text)
            else:
                rendered = _format_sections(thinking_text)

            if rendered:
                yield rendered

        if not answer_started and thinking_parts:
            # The stream ended without an Answer; make sure the full Thinking text is shown
            rendered = _format_sections("".join(thinking_parts))
            if rendered:
                yield rendered
    except Exception as e:
        yield f"Failed to get response: {e}"

def run_example(message: Dict[str, Any], history: List[Dict[str, Any]] | None = None):
    """
    用于 Examples 点击时直接走大模型。
    - 输入还是 ChatInterface 那种 message dict:{"text": ..., "files": [...]}
    - history 是 Chatbot 当前的消息列表(type="messages")
    - 输出改成 Chatbot 需要的消息列表:[{role, content}, ...]
    """
    history = history or []

    # 直接复用你现有的流式函数,只是把它返回的 HTML 包一层 messages
    for rendered in stream_response(message, history):
        # 这里只简单把 user 文本展示出来;图片就当“上下文里有了”,不专门渲染
        user_text = (message.get("text") or "").strip() or "[Example]"
        display_history = history + [
            {"role": "user", "content": user_text},
            {"role": "assistant", "content": rendered},
        ]
        # 关键:对 Chatbot 来说,返回值要是「完整的消息列表」
        yield display_history

def build_demo() -> gr.Blocks:
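    """Assemble the Gradio Blocks UI: hero header, examples panel, and chat interface."""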
    theme = gr.themes.Soft(primary_hue="violet", secondary_hue="cyan", neutral_hue="slate")

    with gr.Blocks(
        title="ERNIE-4.5-VL-28B-A3B-Thinking",
        theme=theme,
        css=CUSTOM_CSS,
    ) as demo:
        with gr.Column(elem_id="ernie-hero"):
            gr.Markdown(
                """
                <h1>Chat with ERNIE-4.5-VL-28B-A3B-Thinking</h1>
                """,
                elem_id="hero-text",
            )
            gr.Markdown(
                """
                <p id="model-link">
                    Model Repository:
                    <a href="https://huggingface.co/baidu/ERNIE-4.5-VL-28B-A3B-Thinking" target="_blank" rel="noopener">
                        ERNIE-4.5-VL-28B-A3B-Thinking
                    </a>
                </p>
                """
            )

        textbox = gr.MultimodalTextbox(
            show_label=False,
            placeholder="Enter text, or upload one or more images...",
            file_types=["image","video"],
            file_count="multiple"
        )
        chatbot = gr.Chatbot(
            type="messages",
            allow_tags=["think"],
            height=560,
            render_markdown=True,
            show_copy_button=True,
        )
        
        examples = [
            {
                "text": "这道题怎么解",
                "files": ["examples/case1.png"] 
            },
            {
                "text": "How many real people are actually in the picture?",
                "files": ["examples/case2.png"]
            },
        ]

        with gr.Column(elem_id="examples-panel"): 
            gr.Examples(
                examples=examples,
                inputs=textbox,
                label=None,
                examples_per_page=4,
                elem_id="examples-grid",
                fn=run_example,      # run the model directly when an example is clicked
                outputs=chatbot,     
                run_on_click=True,   
            )

        with gr.Column(elem_id="chat-wrapper"):
            chat_interface = gr.ChatInterface(
                fn=stream_response,
                type="messages",
                multimodal=True,
                chatbot=chatbot,
                textbox=textbox,
            )


    return demo.queue(default_concurrency_limit=8)



if __name__ == "__main__":
    build_demo().launch()