"""视觉大模型 OCR:用多模态 API 从截图中提取文字。""" from __future__ import annotations from pathlib import Path from .base import OCRProvider from .openai_vision_client import chat_completions, safe_parse_json _VISION_OCR_SYSTEM = """你是 OCR 助手。用户会给你一张截图,请尽可能完整地提取其中的文字。 只输出 JSON,格式:{"text": "提取到的全部文字,保留换行"} 如果没有可识别文字,text 填空字符串。""" class VisionOCR(OCRProvider): """OpenAI 兼容视觉模型识文(GLM-4V / GPT-4o / Qwen-VL / Ollama 等)。""" name = "vision" def __init__( self, base_url: str, api_key: str, model: str, timeout: float = 60.0, allow_upload: bool = True, ) -> None: self.base_url = base_url self.api_key = api_key self.model = model self.timeout = timeout self.allow_upload = allow_upload async def recognize(self, image_path: Path) -> str: """调用视觉模型提取文字。""" if not self.allow_upload: raise RuntimeError("敏感目录禁止上传图片,无法使用视觉 OCR") content = await chat_completions( base_url=self.base_url, api_key=self.api_key, model=self.model, system_prompt=_VISION_OCR_SYSTEM, user_text="请提取这张截图中的所有文字。", image_path=image_path, allow_upload=True, timeout=self.timeout, json_mode=True, ) parsed = safe_parse_json(content) text = parsed.get("text") or parsed.get("ocr_text") or content return str(text).strip()