5c028d7952
包含 FastAPI 后端、React 前端、队列/OCR/标签/待办等完整功能。 Co-authored-by: Cursor <cursoragent@cursor.com>
53 lines
1.7 KiB
Python
53 lines
1.7 KiB
Python
"""视觉大模型 OCR:用多模态 API 从截图中提取文字。"""
|
|
from __future__ import annotations
|
|
|
|
from pathlib import Path
|
|
|
|
from .base import OCRProvider
|
|
from .openai_vision_client import chat_completions, safe_parse_json
|
|
|
|
|
|
# System prompt for the vision model: it asks for a JSON-only reply of the
# form {"text": "..."} and for an empty "text" when nothing is readable.
# The literal is runtime text sent to the model, so it stays in Chinese.
_VISION_OCR_SYSTEM = """你是 OCR 助手。用户会给你一张截图,请尽可能完整地提取其中的文字。
只输出 JSON,格式:{"text": "提取到的全部文字,保留换行"}
如果没有可识别文字,text 填空字符串。"""
|
|
|
|
|
|
class VisionOCR(OCRProvider):
    """OCR provider backed by an OpenAI-compatible vision model.

    Intended for endpoints that speak the OpenAI chat-completions dialect
    (GLM-4V / GPT-4o / Qwen-VL / Ollama, etc.).
    """

    name = "vision"

    def __init__(
        self,
        base_url: str,
        api_key: str,
        model: str,
        timeout: float = 60.0,
        allow_upload: bool = True,
    ) -> None:
        """Store the settings forwarded to ``chat_completions`` on each call.

        Args:
            base_url: Base URL of the model endpoint.
            api_key: API key for the endpoint.
            model: Model identifier to request.
            timeout: Request timeout (presumably seconds; confirm against
                the client helper).
            allow_upload: When false, ``recognize`` refuses to run, so the
                image is never handed to the remote model.
        """
        self.base_url = base_url
        self.api_key = api_key
        self.model = model
        self.timeout = timeout
        self.allow_upload = allow_upload

    async def recognize(self, image_path: Path) -> str:
        """Extract the text of a screenshot by querying the vision model.

        Args:
            image_path: Path of the image to send to the model.

        Returns:
            The recognized text with surrounding whitespace stripped. If the
            reply is a JSON object containing ``text`` or ``ocr_text`` but
            neither holds any text, an empty string is returned. If the
            reply has neither key, or is not a JSON object, the raw reply is
            returned (stripped).

        Raises:
            RuntimeError: If uploads are disabled for this provider.
        """
        if not self.allow_upload:
            raise RuntimeError("敏感目录禁止上传图片,无法使用视觉 OCR")

        content = await chat_completions(
            base_url=self.base_url,
            api_key=self.api_key,
            model=self.model,
            system_prompt=_VISION_OCR_SYSTEM,
            user_text="请提取这张截图中的所有文字。",
            image_path=image_path,
            # The guard above already rejected the upload-disabled case.
            allow_upload=True,
            timeout=self.timeout,
            json_mode=True,
        )
        parsed = safe_parse_json(content)

        if isinstance(parsed, dict):
            keys = ("text", "ocr_text")
            # Prefer the first key that actually carries text.
            for key in keys:
                value = parsed.get(key)
                if value:
                    return str(value).strip()
            # The model answered in the expected schema but found no text
            # (e.g. {"text": ""}). Returning the raw JSON reply here would
            # leak '{"text": ""}' into the OCR result.
            if any(key in parsed for key in keys):
                return ""

        # Not the expected schema: fall back to the raw reply.
        return str(content).strip()
|