Files
SnapAndAnaly/backend/app/providers/ocr_vision.py
T
congsh 5c028d7952 Initial commit: snapAna 截图智能整理工具
包含 FastAPI 后端、React 前端、队列/OCR/标签/待办等完整功能。

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-05-27 15:45:50 +08:00

53 lines
1.7 KiB
Python

"""视觉大模型 OCR:用多模态 API 从截图中提取文字。"""
from __future__ import annotations
from pathlib import Path
from .base import OCRProvider
from .openai_vision_client import chat_completions, safe_parse_json
# System prompt passed to the vision model on every OCR request. It tells the
# model to act as an OCR assistant, extract the screenshot's text as completely
# as possible, and reply with JSON only, in the form {"text": "..."}; when no
# text is recognizable the "text" field is to be an empty string.
_VISION_OCR_SYSTEM = """你是 OCR 助手。用户会给你一张截图,请尽可能完整地提取其中的文字。
只输出 JSON,格式:{"text": "提取到的全部文字,保留换行"}
如果没有可识别文字,text 填空字符串。"""
class VisionOCR(OCRProvider):
    """Text recognition through an OpenAI-compatible vision model.

    Intended for multimodal chat endpoints such as GLM-4V, GPT-4o, Qwen-VL
    or Ollama.
    """

    name = "vision"

    def __init__(
        self,
        base_url: str,
        api_key: str,
        model: str,
        timeout: float = 60.0,
        allow_upload: bool = True,
    ) -> None:
        """Store the connection settings used by :meth:`recognize`.

        Args:
            base_url: Endpoint base URL, forwarded to ``chat_completions``.
            api_key: API key, forwarded to ``chat_completions``.
            model: Model identifier, forwarded to ``chat_completions``.
            timeout: Request timeout forwarded to ``chat_completions``
                (presumably seconds -- confirm against that helper).
            allow_upload: When false, :meth:`recognize` refuses to run so
                that no image is sent to the remote model.
        """
        self.base_url = base_url
        self.api_key = api_key
        self.model = model
        self.timeout = timeout
        self.allow_upload = allow_upload

    async def recognize(self, image_path: Path) -> str:
        """Ask the vision model to extract the text of a screenshot.

        Args:
            image_path: Path of the image handed to ``chat_completions``.

        Returns:
            The recognized text with surrounding whitespace stripped. An
            empty string is returned when the model reports no text.

        Raises:
            RuntimeError: If uploads are disabled for this provider.
        """
        if not self.allow_upload:
            raise RuntimeError("敏感目录禁止上传图片,无法使用视觉 OCR")
        content = await chat_completions(
            base_url=self.base_url,
            api_key=self.api_key,
            model=self.model,
            system_prompt=_VISION_OCR_SYSTEM,
            user_text="请提取这张截图中的所有文字。",
            image_path=image_path,
            # The guard above already established that uploading is allowed.
            allow_upload=True,
            timeout=self.timeout,
            json_mode=True,
        )
        parsed = safe_parse_json(content)
        return self._extract_text(parsed, content)

    @staticmethod
    def _extract_text(parsed: object, fallback: object) -> str:
        """Pick the OCR text out of the parsed model reply.

        The first non-empty value among the ``"text"`` and ``"ocr_text"``
        keys wins. If one of those keys is present but empty, the model
        answered "no text" (as the system prompt instructs), so an empty
        string is returned instead of leaking the raw JSON payload. The raw
        reply is used only when neither key exists or the parsed value is
        not a dict (the return type of ``safe_parse_json`` is not visible
        here -- NOTE(review): confirm it always yields a dict).
        """
        if not isinstance(parsed, dict):
            return str(fallback).strip()
        for key in ("text", "ocr_text"):
            value = parsed.get(key)
            if value:
                return str(value).strip()
        if "text" in parsed or "ocr_text" in parsed:
            # Key present but empty: a legitimate "nothing recognized" reply.
            return ""
        return str(fallback).strip()