5c028d7952
包含 FastAPI 后端、React 前端、队列/OCR/标签/待办等完整功能。 Co-authored-by: Cursor <cursoragent@cursor.com>
108 lines
3.7 KiB
Python
108 lines
3.7 KiB
Python
"""OpenAI 兼容 VLM 实现:覆盖 Ollama / GLM / MiniMax / Moonshot / OpenRouter / OpenAI。"""
|
|
from __future__ import annotations
|
|
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
from app.core.logger import get_logger
|
|
|
|
from .base import VLMProvider, VLMResult
|
|
from .openai_vision_client import chat_completions, safe_parse_json
|
|
|
|
|
|
# Module-level logger, named after this module.
logger = get_logger(__name__)
|
|
|
|
|
|
# System prompt passed as `system_prompt` on every `analyze` call. Written in
# Chinese, it tells the model to act as a screenshot-organizing assistant and
# to output only a JSON object with the keys "title", "summary", "category",
# "tags", "todos" and "suggestion" -- the same keys `_to_vlm_result` reads.
# The text is runtime data sent to the model; do not edit it casually.
_SYSTEM_PROMPT = """你是一个截图整理助手。用户会给你一张截图(可能附带 OCR 文本)。
|
|
请用简洁的中文,按以下 JSON 结构返回分析结果,**只输出 JSON,不要解释**:
|
|
|
|
{
|
|
"title": "一句话标题,不超过 24 个字",
|
|
"summary": "2-3 句话总结这张截图的内容、要点或笑点",
|
|
"category": "从给定分类列表中选一个最贴切的名字;如果都不符合就填'其他'",
|
|
"tags": ["3-6 个能帮助检索的细分标签"],
|
|
"todos": [
|
|
{"title": "如果截图里出现'待看/待读/待办/想试试/记一下'的内容,抽成一条 todo", "kind": "待读|待看|待办|学习", "note": "可空"}
|
|
],
|
|
"suggestion": "可选:给用户的进一步行动建议或同类资源提示,可空"
|
|
}
|
|
|
|
要求:
|
|
- 标题要可读,不要复述"这是一张..."。
|
|
- summary 不要超过 80 字。
|
|
- todos 没有可识别项时给空数组。"""
|
|
|
|
|
|
class OpenAICompatVLM(VLMProvider):
    """VLM provider that talks to an OpenAI-compatible chat-completions API.

    All requests go through the ``chat_completions`` helper. Per the original
    author's note, the endpoint is ``/v1/chat/completions`` and the image is
    sent as a base64 data URL; that encoding happens inside the helper.
    """

    name = "openai_compat"

    def __init__(
        self,
        base_url: str,
        api_key: str,
        model: str,
        timeout: float = 60.0,
    ) -> None:
        """Store the connection settings used for every request.

        Args:
            base_url: Root URL of the API; trailing slashes are stripped.
            api_key: Credential forwarded to ``chat_completions``.
            model: Model identifier forwarded to ``chat_completions``.
            timeout: Request timeout forwarded to ``chat_completions``.
        """
        self.timeout = timeout
        self.model = model
        self.api_key = api_key
        self.base_url = base_url.rstrip("/")

    async def analyze(
        self,
        image_path: Path,
        ocr_text: str,
        categories: list[str],
        allow_upload: bool,
    ) -> VLMResult:
        """Ask the model to analyze a screenshot and return structured data.

        Args:
            image_path: Location of the screenshot on disk.
            ocr_text: OCR output for the screenshot; may be empty.
            categories: Category names the model is asked to choose from.
            allow_upload: When false, no image path is handed to the helper.

        Returns:
            The model's JSON reply converted by ``_to_vlm_result``.
        """
        category_names = ", ".join(categories)
        # An empty OCR result is replaced by an explicit "none" marker.
        ocr_section = ocr_text or "(无)"
        user_text = (
            f"可选分类:{category_names}\n\n"
            f"OCR 文本(可能不完整或为空):\n{ocr_section}"
        )

        # Only pass the image along when the caller permits uploading it.
        attached_image: Path | None = None
        if allow_upload:
            attached_image = image_path

        reply = await chat_completions(
            base_url=self.base_url,
            api_key=self.api_key,
            model=self.model,
            system_prompt=_SYSTEM_PROMPT,
            user_text=user_text,
            image_path=attached_image,
            allow_upload=allow_upload,
            timeout=self.timeout,
            json_mode=True,
        )
        return _to_vlm_result(safe_parse_json(reply))
|
|
|
|
|
|
def _to_vlm_result(data: dict[str, Any]) -> VLMResult:
    """Convert the model's parsed JSON object into a ``VLMResult``.

    Missing, null, or wrongly-typed fields are replaced with safe defaults
    instead of raising, because model output is not guaranteed to follow the
    requested schema.

    Args:
        data: Parsed JSON object returned by the model.

    Returns:
        A ``VLMResult`` whose ``title`` is capped at 128 characters, whose
        ``tags`` hold at most 8 non-empty entries, whose ``category`` is
        ``None`` when absent or empty, and whose ``raw`` is ``data`` itself.
    """
    todos: list[dict[str, str]] = []
    todos_raw = data.get("todos") or []
    if isinstance(todos_raw, list):
        for item in todos_raw:
            if isinstance(item, dict):
                if not item.get("title"):
                    continue  # a todo without a title is useless
                todos.append(
                    {
                        "title": str(item["title"])[:512],
                        # `or` is applied before str() so that an explicit
                        # JSON null falls back to the default instead of
                        # becoming the literal string "None".
                        "kind": str(item.get("kind") or "待办"),
                        "note": str(item.get("note") or ""),
                    }
                )
            elif isinstance(item, str) and item.strip():
                # Bare-string todos get the same title cap as dict todos;
                # blank strings are skipped like dicts with no title.
                todos.append({"title": item[:512], "kind": "待办", "note": ""})

    tags_raw = data.get("tags") or []
    if not isinstance(tags_raw, list):
        tags_raw = []

    return VLMResult(
        title=str(data.get("title") or "")[:128],
        summary=str(data.get("summary") or ""),
        category=str(data.get("category") or "") or None,
        tags=[str(tag) for tag in tags_raw if tag][:8],
        todos=todos,
        suggestion=str(data.get("suggestion") or ""),
        raw=data,
    )
|