Initial commit: snapAna 截图智能整理工具

包含 FastAPI 后端、React 前端、队列/OCR/标签/待办等完整功能。 Co-authored-by: Cursor <cursoragent@cursor.com>
2026-05-27 15:45:50 +08:00
commit 5c028d7952
76 changed files with 10467 additions and 0 deletions
@@ -0,0 +1,107 @@
+"""OpenAI 兼容 VLM 实现：覆盖 Ollama / GLM / MiniMax / Moonshot / OpenRouter / OpenAI。"""
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any
+
+from app.core.logger import get_logger
+
+from .base import VLMProvider, VLMResult
+from .openai_vision_client import chat_completions, safe_parse_json
+
+
+# Module-level logger obtained from the project's logging helper.
+logger = get_logger(__name__)
+
+
+# System prompt handed to ``chat_completions`` as ``system_prompt`` by
+# ``OpenAICompatVLM.analyze``. The literal is runtime text and must not be
+# translated or reformatted. It instructs the model to reply in concise
+# Chinese with JSON only, using the keys ``title``, ``summary``, ``category``,
+# ``tags``, ``todos`` (entries with ``title`` / ``kind`` / ``note``) and
+# ``suggestion``, and to return an empty ``todos`` array when the screenshot
+# contains nothing actionable.
+_SYSTEM_PROMPT = """你是一个截图整理助手。用户会给你一张截图（可能附带 OCR 文本）。
+请用简洁的中文，按以下 JSON 结构返回分析结果，**只输出 JSON，不要解释**：
+
+{
+  "title": "一句话标题，不超过 24 个字",
+  "summary": "2-3 句话总结这张截图的内容、要点或笑点",
+  "category": "从给定分类列表中选一个最贴切的名字；如果都不符合就填'其他'",
+  "tags": ["3-6 个能帮助检索的细分标签"],
+  "todos": [
+     {"title": "如果截图里出现'待看/待读/待办/想试试/记一下'的内容，抽成一条 todo", "kind": "待读|待看|待办|学习", "note": "可空"}
+  ],
+  "suggestion": "可选：给用户的进一步行动建议或同类资源提示，可空"
+}
+
+要求：
+- 标题要可读，不要复述"这是一张..."。
+- summary 不要超过 80 字。
+- todos 没有可识别项时给空数组。"""
+
+
+class OpenAICompatVLM(VLMProvider):
+    """统一调用 /v1/chat/completions，图片以 base64 data URL 传入。"""
+
+    name = "openai_compat"
+
+    def __init__(
+        self,
+        base_url: str,
+        api_key: str,
+        model: str,
+        timeout: float = 60.0,
+    ) -> None:
+        self.base_url = base_url.rstrip("/")
+        self.api_key = api_key
+        self.model = model
+        self.timeout = timeout
+
+    async def analyze(
+        self,
+        image_path: Path,
+        ocr_text: str,
+        categories: list[str],
+        allow_upload: bool,
+    ) -> VLMResult:
+        """调用模型并解析结构化 JSON。"""
+        prompt = (
+            f"可选分类：{', '.join(categories)}\n\n"
+            f"OCR 文本（可能不完整或为空）：\n{ocr_text or '（无）'}"
+        )
+        content = await chat_completions(
+            base_url=self.base_url,
+            api_key=self.api_key,
+            model=self.model,
+            system_prompt=_SYSTEM_PROMPT,
+            user_text=prompt,
+            image_path=image_path if allow_upload else None,
+            allow_upload=allow_upload,
+            timeout=self.timeout,
+            json_mode=True,
+        )
+        parsed = safe_parse_json(content)
+        return _to_vlm_result(parsed)
+
+
+def _to_vlm_result(data: dict[str, Any]) -> VLMResult:
+    """JSON -> dataclass，容错地兜住字段。"""
+    todos_raw = data.get("todos") or []
+    todos: list[dict[str, str]] = []
+    if isinstance(todos_raw, list):
+        for item in todos_raw:
+            if isinstance(item, dict) and item.get("title"):
+                todos.append(
+                    {
+                        "title": str(item.get("title", ""))[:512],
+                        "kind": str(item.get("kind", "")) or "待办",
+                        "note": str(item.get("note", "") or ""),
+                    }
+                )
+            elif isinstance(item, str):
+                todos.append({"title": item, "kind": "待办", "note": ""})
+    tags_raw = data.get("tags") or []
+    if not isinstance(tags_raw, list):
+        tags_raw = []
+    return VLMResult(
+        title=str(data.get("title", "") or "")[:128],
+        summary=str(data.get("summary", "") or ""),
+        category=str(data.get("category") or "") or None,
+        tags=[str(t) for t in tags_raw if t][:8],
+        todos=todos,
+        suggestion=str(data.get("suggestion", "") or ""),
+        raw=data,
+    )