import base64
import html
import mimetypes
import os
from pathlib import Path
from typing import Any, Dict, List

import gradio as gr
from openai import OpenAI

DEFAULT_MODEL = os.getenv("DEFAULT_MODEL", "ERNIE-4.5-VL-28B-A3B-Thinking")
BASE_URL = os.getenv("BASE_URL", "")
API_KEY = os.getenv("ERNIE_API_KEY", "")


CUSTOM_CSS = """
body {
    background: radial-gradient(circle at top, #fdfbff 0%, #e7ecf7 45%, #dfe6f5 100%);
    font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Arial, sans-serif;
    color: #0f172a;
}
.gradio-container {
    max-width: 1200px !important;
    margin: 0 auto;
}
#ernie-hero {
    padding: 12px 0 4px;
}
#ernie-hero h1 {
    font-size: 1.85rem;
    margin-bottom: 0;
    font-weight: 500;
}
#model-link {
    margin-top: 6px;
    font-size: 0.95rem;
}
#model-link a {
    color: #4c1d95;
    text-decoration: none;
    font-weight: 500;
}
#model-link a:hover {
    text-decoration: underline;
}
#examples-panel {
    margin-top: 20px;
    padding: 18px 22px;
    border-radius: 18px;
    border: 1px solid rgba(15, 23, 42, 0.12);
    background: rgba(255, 255, 255, 0.92);
    box-shadow: 0 15px 35px rgba(15, 23, 42, 0.08);
    gap: 18px;
}
#examples-panel h4 {
    margin: 0 0 8px;
    font-size: 1.1rem;
    font-weight: 500;
}
#examples-panel p {
    margin: 0;
    color: rgba(15, 23, 42, 0.7);
    font-size: 0.95rem;
}
#examples-grid table {
    width: 100%;
}
#examples-grid table tbody {
    display: grid;
    grid-template-columns: repeat(auto-fit, minmax(220px, 1fr));
    gap: 12px;
}
#examples-grid table tr {
    display: block;
    background: #f7f9ff;
    border-radius: 14px;
    border: 1px solid rgba(15, 23, 42, 0.08);
    padding: 14px;
    box-shadow: 0 10px 28px rgba(15, 23, 42, 0.08);
}
#examples-grid table td {
    display: block;
    padding: 0;
}
#chat-wrapper {
    margin-top: 32px;
    border-radius: 24px;
    padding: 18px;
    background: rgba(255, 255, 255, 0.95);
    border: 1px solid rgba(15, 23, 42, 0.1);
    box-shadow: 0 25px 60px rgba(15, 23, 42, 0.12);
}
.ernie-section {
    border-radius: 18px;
    margin-bottom: 14px;
    padding: 16px 18px;
    border: 1px solid rgba(15, 23, 42, 0.1);
    background: rgba(255, 255, 255, 0.95);
    box-shadow: 0 10px 24px rgba(15, 23, 42, 0.08);
}
.ernie-section-header {
    font-size: 0.85rem;
    text-transform: uppercase;
    letter-spacing: 0.08em;
    font-weight: 600;
    color: rgba(15, 23, 42, 0.65);
    display: flex;
    align-items: center;
    gap: 6px;
}
.ernie-section-body {
    margin-top: 10px;
    font-size: 1rem;
    color: rgba(15, 23, 42, 0.92);
    white-space: pre-wrap;
    line-height: 1.55;
}
.ernie-thinking {
    border-color: rgba(79, 70, 229, 0.35);
    background: rgba(129, 140, 248, 0.08);
}
.ernie-answer {
    border-color: rgba(16, 185, 129, 0.35);
    background: rgba(110, 231, 183, 0.08);
}

@media (prefers-color-scheme: dark) {
    body {
        background: radial-gradient(circle at top, #1f264b 0%, #0f172a 45%, #040713 100%);
        color: #ecf2ff;
    }
    #model-link a {
        color: #a5b4fc;
    }
    #examples-panel {
        border: 1px solid rgba(255, 255, 255, 0.05);
        background: rgba(8, 13, 30, 0.85);
        box-shadow: 0 15px 45px rgba(3, 7, 18, 0.55);
    }
    #examples-panel p {
        color: rgba(236, 242, 255, 0.75);
    }
    #examples-grid table tr {
        background: rgba(15, 23, 42, 0.7);
        border: 1px solid rgba(255, 255, 255, 0.04);
        box-shadow: 0 10px 30px rgba(4, 6, 15, 0.45);
    }
    #chat-wrapper {
        background: rgba(2, 6, 23, 0.78);
        border: 1px solid rgba(99, 102, 241, 0.25);
        box-shadow: 0 25px 70px rgba(2, 6, 23, 0.7);
    }
    .ernie-section {
        border: 1px solid rgba(255, 255, 255, 0.08);
        background: rgba(15, 23, 42, 0.85);
        box-shadow: 0 10px 30px rgba(2, 6, 23, 0.55);
    }
    .ernie-section-header {
        color: rgba(236, 242, 255, 0.75);
    }
    .ernie-section-body {
        color: rgba(248, 250, 255, 0.95);
    }
    .ernie-answer {
        border-color: rgba(45, 212, 191, 0.45);
        background: rgba(8, 47, 56, 0.65);
    }
    .ernie-thinking {
        border-color: rgba(165, 180, 252, 0.4);
        background: rgba(30, 27, 75, 0.65);
    }
}
"""

_client = OpenAI(
    base_url=BASE_URL,
    api_key=API_KEY,
)

def _data_url(path: str) -> str:
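    """Encode a local file as a base64 data URL with a guessed MIME type."""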
    mime, _ = mimetypes.guess_type(path)
    mime = mime or "application/octet-stream"
    data = base64.b64encode(Path(path).read_bytes()).decode("utf-8")
    return f"data:{mime};base64,{data}"

def _media_content(path: str) -> Dict[str, Any]:
    """支持图片和视频"""
    mime, _ = mimetypes.guess_type(path)
    if mime and mime.startswith("video"):
        # 视频格式
        return {"type": "video_url", "video_url": {"url": _data_url(path)}}
    else:
        # 图片格式(默认)
        return {"type": "image_url", "image_url": {"url": _data_url(path)}}

def _text_content(text: str) -> Dict[str, Any]:
    return {"type": "text", "text": text}

def _message(role: str, content: Any) -> Dict[str, Any]:
    return {"role": role, "content": content}

def _format_sections(thinking: str, answer: str | None = None) -> str:
    """Render Thinking/Answer blocks with HTML so the chatbot can style them."""
    def _build_block(kind: str, label: str, text: str, icon: str) -> str:
        text = (text or "").strip()
        if not text:
            return ""
        escaped = html.escape(text)
        return (
            f'<div class="ernie-section ernie-{kind}">'
            f'<div class="ernie-section-header">{icon} {label}</div>'
            f'<div class="ernie-section-body">{escaped}</div>'
            "</div>"
        )

    sections = [
        _build_block("thinking", "Thinking", thinking, "🧠"),
        _build_block("answer", "Answer", answer, "✨") if answer is not None else "",
    ]
    rendered = "".join(section for section in sections if section)
    return rendered

def _build_user_message(message: Dict[str, Any]) -> Dict[str, Any]:
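    """Convert a MultimodalTextbox payload into an OpenAI-style user message."""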
    files = message.get("files") or []
    text = (message.get("text") or "").strip()
    content: List[Dict[str, Any]] = [_media_content(p) for p in files]
    if text:
        content.append(_text_content(text))
    return _message("user", content)

def _convert_history(history: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
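    """Convert Gradio chat history (type="messages") into OpenAI-style messages.

    Consecutive user items (text and file paths) are merged into one user message;
    assistant turns are reduced to their Answer text only.
    """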
    msgs: List[Dict[str, Any]] = []
    user_content: List[Dict[str, Any]] = []
    
    for turn in history or []:
        role, content = turn.get("role"), turn.get("content")
        if role == "user":
            if isinstance(content, str):
                user_content.append(_text_content(content))
            elif isinstance(content, tuple):
                user_content.extend(_media_content(path) for path in content if path)
        elif role == "assistant":
            if "Answer:\n" in content:
                # Split off and keep only the Answer portion
                answer_only = content.split("Answer:\n", 1)[1].strip()
            else:
                # No Thinking prefix: use the content as-is
                answer_only = content.strip()

            if user_content:
                msgs.append(_message("user", user_content.copy()))
                user_content.clear()

            msgs.append(_message("assistant", [{"type": "text", "text": answer_only}]))
            
    return msgs


def stream_response(message: Dict[str, Any], history: List[Dict[str, Any]], model_name: str = DEFAULT_MODEL):
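    """Stream a chat completion, rendering Thinking and Answer blocks as HTML."""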
    messages = _convert_history(history)
    messages.append(_build_user_message(message))
  
    try:
        stream = _client.chat.completions.create(
            model="default",
            messages=messages,
            stream=True
        )
        thinking_parts: List[str] = []
        answer_parts: List[str] = []
        answer_started = False

        for chunk in stream:
            delta = chunk.choices[0].delta

            if getattr(delta, "reasoning_content", None):
                thinking_parts.append(delta.reasoning_content)

            if getattr(delta, "content", None):
                answer_started = True
                answer_parts.append(delta.content)

            thinking_text = "".join(thinking_parts)
            answer_text = "".join(answer_parts) if answer_parts else None

            if answer_started:
                rendered = _format_sections(thinking_text, answer_text)
            else:
                rendered = _format_sections(thinking_text)

            if rendered:
                yield rendered

        if not answer_started and thinking_parts:
            # The stream ended without an Answer; make sure the full Thinking text is shown
            rendered = _format_sections("".join(thinking_parts))
            if rendered:
                yield rendered
    except Exception as e:
        yield f"Failed to get response: {e}"

def run_example(message: Dict[str, Any], history: List[Dict[str, Any]] | None = None):
    """
    用于 Examples 点击时直接走大模型。
    - 输入还是 ChatInterface 那种 message dict:{"text": ..., "files": [...]}
    - history 是 Chatbot 当前的消息列表(type="messages")
    - 输出改成 Chatbot 需要的消息列表:[{role, content}, ...]
    """
    history = history or []

    # 直接复用你现有的流式函数,只是把它返回的 HTML 包一层 messages
    for rendered in stream_response(message, history):
        # 这里只简单把 user 文本展示出来;图片就当“上下文里有了”,不专门渲染
        user_text = (message.get("text") or "").strip() or "[Example]"
        display_history = history + [
            {"role": "user", "content": user_text},
            {"role": "assistant", "content": rendered},
        ]
        # 关键:对 Chatbot 来说,返回值要是「完整的消息列表」
        yield display_history

def build_demo() -> gr.Blocks:
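    """Assemble the Gradio Blocks UI: hero header, examples panel, and chat interface."""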
    theme = gr.themes.Soft(primary_hue="violet", secondary_hue="cyan", neutral_hue="slate")

    with gr.Blocks(
        title="ERNIE-4.5-VL-28B-A3B-Thinking",
        theme=theme,
        css=CUSTOM_CSS,
    ) as demo:
        with gr.Column(elem_id="ernie-hero"):
            gr.Markdown(
                """
                <h1>Chat with ERNIE-4.5-VL-28B-A3B-Thinking</h1>
                """,
                elem_id="hero-text",
            )
            gr.Markdown(
                """
                <p id="model-link">
                    Model Repository:
                    <a href="https://huggingface.co/baidu/ERNIE-4.5-VL-28B-A3B-Thinking" target="_blank" rel="noopener">
                        ERNIE-4.5-VL-28B-A3B-Thinking
                    </a>
                </p>
                """
            )

        textbox = gr.MultimodalTextbox(
            show_label=False,
            placeholder="Enter text, or upload one or more images...",
            file_types=["image","video"],
            file_count="multiple"
        )
        chatbot = gr.Chatbot(
            type="messages",
            allow_tags=["think"],
            height=560,
            render_markdown=True,
            show_copy_button=True,
        )
        
        examples = [
            {
                "text": "这道题怎么解",
                "files": ["examples/case1.png"] 
            },
            {
                "text": "How many real people are actually in the picture?",
                "files": ["examples/case2.png"]
            },
        ]

        with gr.Column(elem_id="examples-panel"): 
            gr.Examples(
                examples=examples,
                inputs=textbox,
                label=None,
                examples_per_page=4,
                elem_id="examples-grid",
                fn=run_example,      # run the model directly when an example is clicked
                outputs=chatbot,     
                run_on_click=True,   
            )

        with gr.Column(elem_id="chat-wrapper"):
            chat_interface = gr.ChatInterface(
                fn=stream_response,
                type="messages",
                multimodal=True,
                chatbot=chatbot,
                textbox=textbox,
            )


    return demo.queue(default_concurrency_limit=8)



if __name__ == "__main__":
    build_demo().launch()