"""OpenAI 兼容 VLM 实现:覆盖 Ollama / GLM / MiniMax / Moonshot / OpenRouter / OpenAI。""" from __future__ import annotations from pathlib import Path from typing import Any from app.core.logger import get_logger from .base import VLMProvider, VLMResult from .openai_vision_client import chat_completions, safe_parse_json logger = get_logger(__name__) _SYSTEM_PROMPT = """你是一个截图整理助手。用户会给你一张截图(可能附带 OCR 文本)。 请用简洁的中文,按以下 JSON 结构返回分析结果,**只输出 JSON,不要解释**: { "title": "一句话标题,不超过 24 个字", "summary": "2-3 句话总结这张截图的内容、要点或笑点", "category": "从给定分类列表中选一个最贴切的名字;如果都不符合就填'其他'", "tags": ["3-6 个能帮助检索的细分标签"], "todos": [ {"title": "如果截图里出现'待看/待读/待办/想试试/记一下'的内容,抽成一条 todo", "kind": "待读|待看|待办|学习", "note": "可空"} ], "suggestion": "可选:给用户的进一步行动建议或同类资源提示,可空" } 要求: - 标题要可读,不要复述"这是一张..."。 - summary 不要超过 80 字。 - todos 没有可识别项时给空数组。""" class OpenAICompatVLM(VLMProvider): """统一调用 /v1/chat/completions,图片以 base64 data URL 传入。""" name = "openai_compat" def __init__( self, base_url: str, api_key: str, model: str, timeout: float = 60.0, ) -> None: self.base_url = base_url.rstrip("/") self.api_key = api_key self.model = model self.timeout = timeout async def analyze( self, image_path: Path, ocr_text: str, categories: list[str], allow_upload: bool, ) -> VLMResult: """调用模型并解析结构化 JSON。""" prompt = ( f"可选分类:{', '.join(categories)}\n\n" f"OCR 文本(可能不完整或为空):\n{ocr_text or '(无)'}" ) content = await chat_completions( base_url=self.base_url, api_key=self.api_key, model=self.model, system_prompt=_SYSTEM_PROMPT, user_text=prompt, image_path=image_path if allow_upload else None, allow_upload=allow_upload, timeout=self.timeout, json_mode=True, ) parsed = safe_parse_json(content) return _to_vlm_result(parsed) def _to_vlm_result(data: dict[str, Any]) -> VLMResult: """JSON -> dataclass,容错地兜住字段。""" todos_raw = data.get("todos") or [] todos: list[dict[str, str]] = [] if isinstance(todos_raw, list): for item in todos_raw: if isinstance(item, dict) and item.get("title"): todos.append( { "title": str(item.get("title", ""))[:512], "kind": str(item.get("kind", "")) or "待办", "note": str(item.get("note", "") or ""), } ) elif isinstance(item, str): todos.append({"title": item, "kind": "待办", "note": ""}) tags_raw = data.get("tags") or [] if not isinstance(tags_raw, list): tags_raw = [] return VLMResult( title=str(data.get("title", "") or "")[:128], summary=str(data.get("summary", "") or ""), category=str(data.get("category") or "") or None, tags=[str(t) for t in tags_raw if t][:8], todos=todos, suggestion=str(data.get("suggestion", "") or ""), raw=data, )