完整注释代码

本篇保留项目的完整注释代码，方便和前面几篇模块分析对照阅读。阅读顺序建议是：先看 02_系统架构与回调流程、04_LangChain链路与会话历史、05_图像处理与多模态消息组装，再回到这里整体串读。
1
# =============================================================================
2
# 多模态聊天机器人（LangChain + ASR 版）
3
# 架构：Gradio 前端 → 消息预处理 → LangChain → 阿里云多模态大模型
4
# =============================================================================
5

6
import base64       # 二进制 → base64 字符串，用于 API 传输
7
import io           # 内存字节流，图片转换时不写临时文件
8
import time         # sensevoice 轮询时休眠
9
import httpx        # HTTP 客户端，调用阿里云原生 API
10
import gradio as gr
11
from PIL import Image
12
from pathlib import Path
13

14
from langchain_community.chat_message_histories import SQLChatMessageHistory
15
from langchain_core.messages import HumanMessage
16
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
17
from langchain_core.runnables import RunnableWithMessageHistory
18

19
from my_llm import multiModal_llm, asr_client
20

21

22
# ── 常量 ──────────────────────────────────────────────────────────────────────
23
# 原生多模态接口（支持 base64 音频）；compatible-mode 只支持公网 URL
24
# Native DashScope multimodal-generation endpoint, used for the synchronous
# qwen3-asr-flash request with an inline base64 audio payload.
DASHSCOPE_MULTIMODAL_URL = (
    "https://dashscope.aliyuncs.com/api/v1/services/aigc/"
    "multimodal-generation/generation"
)
# Submission endpoint for asynchronous ASR transcription tasks.
DASHSCOPE_ASR_SUBMIT_URL = (
    "https://dashscope.aliyuncs.com/api/v1/services/audio/asr/transcription"
)
# Task-status endpoint; fill in with ``.format(task_id=...)``.
DASHSCOPE_TASK_QUERY_URL = "https://dashscope.aliyuncs.com/api/v1/tasks/{task_id}"
POLL_MAX_RETRIES = 20        # poll the task status at most 20 times
POLL_INTERVAL_SECONDS = 2    # sleep 2 s before each poll (about 40 s of waiting in total)
34

35

36
# =============================================================================
37
# 语音识别：本地音频 → 文字
38
# 主方案：qwen3-asr-flash（同步）
39
# 降级方案：sensevoice-v1（异步轮询）
40
# =============================================================================
41

42
def transcribe_audio_to_text(audio_path: str) -> str:
43
    print(f"[正在识别语音] {audio_path}")
44

45
    # 步骤 1：音频文件 → Base64 Data URI
46
    # Data URI 格式：data:<MIME>;base64,<数据>，可内嵌在 JSON 里传输
47
    ext = Path(audio_path).suffix.lower().lstrip('.') or 'wav'
48
    fmt_map = {'m4a': 'mp4', 'ogg': 'ogg', 'flac': 'flac', 'mp3': 'mp3'}
49
    audio_fmt = fmt_map.get(ext, 'wav')
50
    mime_type = f"audio/{audio_fmt}"
51

52
    with open(audio_path, "rb") as f:
53
        audio_b64 = base64.b64encode(f.read()).decode('utf-8')
54
    data_uri = f"data:{mime_type};base64,{audio_b64}"
55

56
    api_key = asr_client.api_key  # 从已初始化的客户端取 key，不用重复配置
57

58
    # 步骤 2：主方案 —— qwen3-asr-flash（DashScope 原生多模态接口）
59
    # 注意：audio 字段必须是字符串（Data URI），不能是字典
60
    try:
61
        resp = httpx.post(
62
            DASHSCOPE_MULTIMODAL_URL,
63
            headers={"Authorization": f"Bearer {api_key}",
64
                     "Content-Type": "application/json"},
65
            json={
66
                "model": "qwen3-asr-flash",
67
                "input": {
68
                    "messages": [
69
                        {"role": "system", "content": [{"text": ""}]},  # 必须有，留空
70
                        {"role": "user",   "content": [{"audio": data_uri}]},
71
                    ]
72
                },
73
                "parameters": {"asr_options": {"enable_itn": False}},
74
            },
75
            timeout=30,
76
        )
77
        resp.raise_for_status()
78
        # 返回结构：output.choices[0].message.content[0].text
79
        text = resp.json()["output"]["choices"][0]["message"]["content"][0]["text"].strip()
80
        print(f"[识别结果]: {text}")
81
        return text
82

83
    except Exception as exc:
84
        print(f"\n[qwen3-asr-flash 失败] {exc}")
85
        print("[切换到 sensevoice-v1 异步接口...]")
86

87
    # 步骤 3：降级方案 —— sensevoice-v1（异步「提交-轮询」）
88
    try:
89
        # 第一步：提交任务，X-DashScope-Async: enable 是必须的请求头
90
        submit_resp = httpx.post(
91
            DASHSCOPE_ASR_SUBMIT_URL,
92
            headers={"Authorization": f"Bearer {api_key}",
93
                     "Content-Type": "application/json",
94
                     "X-DashScope-Async": "enable"},
95
            json={"model": "sensevoice-v1",
96
                  "input": {"file_url": data_uri},
97
                  "parameters": {}},
98
            timeout=15,
99
        )
100
        submit_resp.raise_for_status()
101
        task_id = submit_resp.json()["output"]["task_id"]
102
        print(f"[sensevoice 任务已提交] task_id={task_id}")
103

104
        # 第二步：轮询任务状态，直到完成或超时
105
        for attempt in range(POLL_MAX_RETRIES):
106
            time.sleep(POLL_INTERVAL_SECONDS)
107
            poll_resp = httpx.get(
108
                DASHSCOPE_TASK_QUERY_URL.format(task_id=task_id),
109
                headers={"Authorization": f"Bearer {api_key}"},
110
                timeout=10,
111
            )
112
            poll_resp.raise_for_status()
113
            poll_data = poll_resp.json()
114
            status = poll_data["output"]["task_status"]
115
            print(f"[轮询 {attempt + 1}/{POLL_MAX_RETRIES}] 状态: {status}")
116

117
            if status == "SUCCEEDED":
118
                text = poll_data["output"]["results"][0]["transcription"]
119
                print(f"[降级识别结果]: {text}")
120
                return text
121
            elif status in ("FAILED", "CANCELED"):
122
                print(f"[sensevoice 任务失败] {poll_data}")
123
                break
124

125
    except Exception as fallback_exc:
126
        print(f"[降级识别也失败] {fallback_exc}")
127

128
    return "［抱歉，系统未能听清您的语音内容］"
129

130

131
# =============================================================================
132
# LangChain 初始化
133
# =============================================================================
134

135
# Prompt 模板：system 设定人格，MessagesPlaceholder 是历史消息的占位符
136
# 运行时框架会把 SQLite 历史插入到 placeholder 位置
137
# Prompt template: a fixed system message followed by a placeholder that is
# filled from the "messages" input variable at run time.
prompt = ChatPromptTemplate.from_messages([
    ('system', "你是一个强大的多模态AI助手，可以精准地处理文本和图像输入。"),
    MessagesPlaceholder(variable_name="messages"),
])

# The pipe operator chains the two runnables: input -> prompt -> model -> output.
chain = prompt | multiModal_llm
144

145

146
def get_session_history(session_id: str) -> SQLChatMessageHistory:
    """Return the SQLite-backed message history for *session_id*.

    Passed to ``RunnableWithMessageHistory`` as its history factory, so
    application code does not need to call it directly.

    Args:
        session_id: Identifier that selects which conversation to load.

    Returns:
        A ``SQLChatMessageHistory`` bound to ``chat_history.db`` (resolved
        relative to the current working directory).
    """
    return SQLChatMessageHistory(
        session_id=session_id,
        connection='sqlite:///chat_history.db',
    )
152

153

154
# 包装 chain，赋予「自动读写历史」能力
155
chain_history = RunnableWithMessageHistory(chain, get_session_history)

# NOTE: the session id is hard-coded, so every user of this process shares one
# conversation history. For multi-user use, derive it per user or per session
# (for example str(uuid.uuid4())).
config = {"configurable": {"session_id": "usr000"}}
160

161

162
# =============================================================================
163
# 图像处理
164
# =============================================================================
165

166
def transcribe_image(image_path: str) -> dict | None:
167
    """图片文件 → OpenAI 兼容的 image_url 格式字典"""
168
    try:
169
        with Image.open(image_path) as img:
170
            if img.mode in ('RGBA', 'P'):
171
                img = img.convert('RGB')  # RGBA/P 模式不支持 JPEG，统一转 RGB
172
            img_format = img.format or 'JPEG'
173
            buffered = io.BytesIO()       # 内存流，避免写临时文件
174
            img.save(buffered, format=img_format)
175
            image_data = base64.b64encode(buffered.getvalue()).decode('utf-8')
176
            return {
177
                "type": "image_url",
178
                "image_url": {
179
                    "url": f"data:image/{img_format.lower()};base64,{image_data}",
180
                    "detail": "high",  # high: 高精度，按尺寸计费；low: 固定85token
181
                },
182
            }
183
    except Exception as e:
184
        print(f"[图像处理失败] {e}")
185
        return None
186

187

188
def _append_file(content: list, file_path: str) -> None:
189
    """根据后缀追加文件到 content 列表（音频已在前端拦截，这里只剩图片）"""
190
    ext = Path(file_path).suffix.lower()
191
    if ext in ('.jpg', '.jpeg', '.png', '.webp', '.bmp', '.gif'):
192
        msg = transcribe_image(file_path)
193
        if msg:
194
            content.append(msg)
195
    else:
196
        print(f"[警告] 不支持的文件类型: {file_path}")
197

198

199
# =============================================================================
200
# 消息组装
201
# =============================================================================
202

203
def get_last_user_messages(history: list) -> list:
    """Return the messages that follow the most recent assistant reply.

    These are the messages added in the current turn; only they are passed
    on, so earlier turns are not sent a second time.

    Args:
        history: Chat history as a list of dicts, each with a ``"role"`` key.

    Returns:
        A new list with the trailing non-assistant messages. Empty when
        *history* is empty or already ends with an assistant message; the
        whole history (copied) when it contains no assistant message.
    """
    if not history or history[-1]["role"] == "assistant":
        return []

    # Walk backwards; `offset` counts the messages seen after the assistant one.
    for offset, message in enumerate(reversed(history)):
        if message["role"] == "assistant":
            return history[len(history) - offset:]
    return history[:]
216

217

218
# =============================================================================
219
# Gradio 回调
220
# =============================================================================
221

222
def add_message(history: list, messages: dict) -> tuple:
    """First callback: record the user's input in *history* for display.

    - Audio files are transcribed and stored as a text message, so no raw
      audio reaches the model chain.
    - Every other file is stored as ``{'path': ...}`` for ``submit_messages``
      to encode later.
    - Non-empty text is stored as a plain string.

    Args:
        history: Chat history list of message dicts; mutated in place.
        messages: Textbox payload read via its ``"files"`` and ``"text"`` keys.

    Returns:
        ``(history, textbox)`` where ``textbox`` is a ``gr.MultimodalTextbox``
        with its value cleared and interaction disabled.
    """
    # `or []` / `or ""` also cover keys that are present but set to None.
    for file_path in messages.get('files') or []:
        print(f"[UI] 收到文件: {file_path}")
        ext = Path(file_path).suffix.lower()
        if ext in ('.wav', '.mp3', '.m4a', '.ogg', '.flac'):
            text = transcribe_audio_to_text(file_path)
            if text:
                history.append({'role': 'user', 'content': f"🎤 [语音输入]: {text}"})
        else:
            # File entries are stored in dict form rather than as a tuple.
            history.append({'role': 'user', 'content': {'path': file_path}})

    text_input = (messages.get("text") or "").strip()
    if text_input:
        history.append({"role": "user", "content": text_input})

    # Clear and lock the input box until the reply has been produced.
    return history, gr.MultimodalTextbox(value=None, interactive=False)
246

247

248
def submit_messages(history: list) -> list:
    """Second callback: build this turn's message, call the chain, append the reply.

    Three shapes of message ``content`` are handled: ``str`` (plain text),
    ``list`` (parts with ``type`` of ``'text'`` or ``'file'``) and ``dict``
    (a file reference carrying ``path``, ``url`` or ``name``).

    Args:
        history: Chat history list of message dicts; mutated in place.

    Returns:
        The same *history* list. An assistant message is appended with either
        the model reply or an error notice; nothing is appended when no
        usable content could be assembled.
    """
    user_messages = get_last_user_messages(history)
    content = []
    has_text = False

    for x in user_messages:
        msg_content = x['content']

        if isinstance(msg_content, str):
            content.append({'type': 'text', 'text': msg_content})
            has_text = True

        elif isinstance(msg_content, list):
            for item in msg_content:
                if not isinstance(item, dict):
                    continue
                item_type = item.get('type')
                if item_type == 'text':
                    # Missing or empty text parts are skipped rather than
                    # raising KeyError or counting as user text.
                    text = item.get('text')
                    if text:
                        content.append({'type': 'text', 'text': text})
                        has_text = True
                elif item_type == 'file':
                    # `or {}` also covers a 'file' key that is set to None.
                    fp = (item.get('file') or {}).get('path')
                    if fp:
                        _append_file(content, fp)

        elif isinstance(msg_content, dict):
            fp = msg_content.get('path') or msg_content.get('url') or msg_content.get('name')
            if fp:
                _append_file(content, fp)

    # Images without any text get a default "describe this image" instruction.
    if content and not has_text:
        content.append({'type': 'text', 'text': '请详细描述这张图片的内容。'})

    if not content:
        print("[警告] content 为空，跳过本次调用")
        return history

    input_message = HumanMessage(content=content)

    try:
        # The history-aware chain loads and stores the conversation itself.
        resp = chain_history.invoke({'messages': [input_message]}, config)
        history.append({'role': 'assistant', 'content': resp.content})
    except Exception as e:  # show the failure in the chat instead of crashing the UI
        print(f"\n[调用大模型失败] {e}\n")
        history.append({'role': 'assistant', 'content': f"⚠️ 请求出错：{e}"})

    return history
300

301

302
# =============================================================================
303
# Gradio 界面
304
# =============================================================================
305

306
with gr.Blocks(title='多模态聊天机器人') as block:
    chatbot = gr.Chatbot(height=600, label='AI 助手')

    chat_input = gr.MultimodalTextbox(
        interactive=True,
        file_types=['image', 'audio'],
        file_count="multiple",
        placeholder="请输入文字，或点击 📎 上传图片/语音...",
        show_label=False,
        sources=["microphone", "upload"],
    )

    # Three-step event chain:
    # 1. add_message     -> show the user's message and lock the input box
    # 2. submit_messages -> call the model and append its reply
    # 3. lambda          -> unlock the input box
    chat_input.submit(
        fn=add_message,
        inputs=[chatbot, chat_input],
        outputs=[chatbot, chat_input],
    ).then(
        fn=submit_messages,
        inputs=[chatbot],
        outputs=[chatbot],
    ).then(
        fn=lambda: gr.MultimodalTextbox(interactive=True),
        inputs=None,
        outputs=[chat_input],
    )

if __name__ == '__main__':
    block.launch(theme=gr.themes.Soft())