Files
SnapAndAnaly/backend/app/providers/vlm_openai.py
T
congsh 5c028d7952 Initial commit: snapAna 截图智能整理工具
包含 FastAPI 后端、React 前端、队列/OCR/标签/待办等完整功能。

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-05-27 15:45:50 +08:00

108 lines
3.7 KiB
Python

"""OpenAI 兼容 VLM 实现:覆盖 Ollama / GLM / MiniMax / Moonshot / OpenRouter / OpenAI。"""
from __future__ import annotations
from pathlib import Path
from typing import Any
from app.core.logger import get_logger
from .base import VLMProvider, VLMResult
from .openai_vision_client import chat_completions, safe_parse_json
# Module-level logger obtained from the project's get_logger helper.
logger = get_logger(__name__)
# System prompt passed as `system_prompt` on every OpenAICompatVLM.analyze() call.
# It tells the model it is a screenshot-organising assistant and must answer
# in concise Chinese with JSON only (no explanation), using the keys
# title / summary / category / tags / todos / suggestion. Those are the keys
# _to_vlm_result() reads back out of the parsed reply.
# The text is runtime data sent to the model: do not reformat or translate it
# without re-validating model output.
_SYSTEM_PROMPT = """你是一个截图整理助手。用户会给你一张截图(可能附带 OCR 文本)。
请用简洁的中文,按以下 JSON 结构返回分析结果,**只输出 JSON,不要解释**:
{
"title": "一句话标题,不超过 24 个字",
"summary": "2-3 句话总结这张截图的内容、要点或笑点",
"category": "从给定分类列表中选一个最贴切的名字;如果都不符合就填'其他'",
"tags": ["3-6 个能帮助检索的细分标签"],
"todos": [
{"title": "如果截图里出现'待看/待读/待办/想试试/记一下'的内容,抽成一条 todo", "kind": "待读|待看|待办|学习", "note": "可空"}
],
"suggestion": "可选:给用户的进一步行动建议或同类资源提示,可空"
}
要求:
- 标题要可读,不要复述"这是一张..."
- summary 不要超过 80 字。
- todos 没有可识别项时给空数组。"""
class OpenAICompatVLM(VLMProvider):
    """VLM provider that talks to an OpenAI-compatible chat-completions API.

    Every request goes through the shared ``chat_completions`` helper. Per the
    original author's note, the endpoint is ``/v1/chat/completions`` and the
    image is sent as a base64 data URL; that encoding happens inside the
    helper, not in this class.
    """

    name = "openai_compat"

    def __init__(
        self,
        base_url: str,
        api_key: str,
        model: str,
        timeout: float = 60.0,
    ) -> None:
        """Store connection settings.

        Args:
            base_url: API root; trailing slashes are removed.
            api_key: Credential forwarded to the client helper.
            model: Model identifier to request.
            timeout: Timeout value forwarded to the client helper.
        """
        # Normalise once so later URL joins never produce a double slash.
        self.base_url = base_url.rstrip("/")
        self.api_key = api_key
        self.model = model
        self.timeout = timeout

    async def analyze(
        self,
        image_path: Path,
        ocr_text: str,
        categories: list[str],
        allow_upload: bool,
    ) -> VLMResult:
        """Ask the model to analyse one screenshot and return a structured result.

        Args:
            image_path: Location of the screenshot on disk.
            ocr_text: OCR text for the screenshot; may be empty.
            categories: Category names the model may choose from.
            allow_upload: When false, the image path is withheld from the
                request and only the text prompt is sent.

        Returns:
            The parsed model reply converted by ``_to_vlm_result``.
        """
        category_line = ", ".join(categories)
        ocr_block = ocr_text or "(无)"
        user_text = (
            f"可选分类:{category_line}\n\n"
            f"OCR 文本(可能不完整或为空):\n{ocr_block}"
        )

        # Withhold the image entirely unless uploading has been permitted.
        if allow_upload:
            outgoing_image = image_path
        else:
            outgoing_image = None

        request_kwargs = {
            "base_url": self.base_url,
            "api_key": self.api_key,
            "model": self.model,
            "system_prompt": _SYSTEM_PROMPT,
            "user_text": user_text,
            "image_path": outgoing_image,
            "allow_upload": allow_upload,
            "timeout": self.timeout,
            "json_mode": True,
        }
        reply = await chat_completions(**request_kwargs)
        return _to_vlm_result(safe_parse_json(reply))
def _to_vlm_result(data: dict[str, Any]) -> VLMResult:
    """Convert the model's parsed JSON object into a ``VLMResult``.

    Missing, null or wrongly-typed fields are coerced to safe defaults
    instead of raising.

    Args:
        data: JSON object parsed from the model reply.

    Returns:
        A ``VLMResult`` where:
        - ``title`` is capped at 128 characters;
        - ``category`` is ``None`` when absent or empty;
        - ``tags`` holds at most 8 non-empty entries, stringified;
        - ``todos`` is a list of ``{"title", "kind", "note"}`` dicts, with
          titles capped at 512 characters and ``kind`` defaulting to "待办";
        - ``raw`` is the untouched input ``data``.
    """
    todos: list[dict[str, str]] = []
    todos_raw = data.get("todos") or []
    if isinstance(todos_raw, list):
        for item in todos_raw:
            if isinstance(item, dict) and item.get("title"):
                todos.append(
                    {
                        "title": str(item.get("title", ""))[:512],
                        # Apply the default BEFORE str(): an explicit JSON
                        # null would otherwise become the literal "None".
                        "kind": str(item.get("kind") or "待办"),
                        "note": str(item.get("note") or ""),
                    }
                )
            elif isinstance(item, str) and item.strip():
                # Bare-string todos follow the same rules as dict todos:
                # blank titles are dropped and the length is capped.
                todos.append({"title": item[:512], "kind": "待办", "note": ""})

    tags_raw = data.get("tags") or []
    if not isinstance(tags_raw, list):
        tags_raw = []

    return VLMResult(
        title=str(data.get("title", "") or "")[:128],
        summary=str(data.get("summary", "") or ""),
        category=str(data.get("category") or "") or None,
        tags=[str(t) for t in tags_raw if t][:8],
        todos=todos,
        suggestion=str(data.get("suggestion", "") or ""),
        raw=data,
    )